diff --git a/docs/source/api.rst b/docs/source/api.rst index 537a6680..9e09f56f 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -10,4 +10,5 @@ API documentation api/splitters api/metrics api/bench - api/helpers \ No newline at end of file + api/helpers + api/structures \ No newline at end of file diff --git a/docs/source/api/featurizers.rst b/docs/source/api/featurizers.rst index 954b8b6f..048cb031 100644 --- a/docs/source/api/featurizers.rst +++ b/docs/source/api/featurizers.rst @@ -94,4 +94,21 @@ Host Guest featurization ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. automodule:: mofdscribe.featurizers.hostguest.host_guest_featurizer + :members: + + +Graph featurization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: mofdscribe.featurizers.graph.graph_featurizer + :members: + +.. automodule:: mofdscribe.featurizers.graph.dimensionality + :members: + + +Matminer adapter +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: mofdscribe.featurizers.matmineradapter :members: \ No newline at end of file diff --git a/docs/source/api/structures.rst b/docs/source/api/structures.rst new file mode 100644 index 00000000..dc6e7045 --- /dev/null +++ b/docs/source/api/structures.rst @@ -0,0 +1,9 @@ + +Structure inputs +------------------- + +MOF +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. automodule:: mofdscribe.mof + :members: diff --git a/docs/source/background.rst b/docs/source/background.rst index 6fa7e999..e19d032a 100644 --- a/docs/source/background.rst +++ b/docs/source/background.rst @@ -60,6 +60,17 @@ descriptor that only considers the local environment (of e.g., 3 atoms). For thi featurizers/global/* +Graph featurizers +-------------------- + +Graph featurizers are a special class of featurizers that are based on the structure graph. The structure graph is a periodic graph in which the edges are bonds and the nodes are atoms. + +.. toctree:: + :glob: + :maxdepth: 1 + + featurizers/graph/* + BU-centered featurizers ----------------------------- @@ -75,11 +86,13 @@ mofdscribe can compute descriptors that are BU-centred, for instance, using RDKi from matminer.featurizers.structure import SiteStatsFingerprint from pymatgen.core import Structure from mofdscribe.featurizers.bu import BUFeaturizer + from mofdscribe.mof import MOF + from mofdscribe.featurizers.matmineradapter import MatminerAdapter - base_feat = SiteStatsFingerprint(SOAP.from_preset("formation_energy")) - base_feat.fit([hkust_structure]) + base_feat = MatminerAdapter(SiteStatsFingerprint(SOAP.from_preset("formation_energy"))) + base_feat.fit([MOF(hkust_structure)]) featurizer = BUFeaturizer(base_feat, aggregations=("mean",)) - features = featurizer.featurize(structure=hkust_structure) + features = featurizer.featurize(structure=MOF(hkust_structure)) For this, you can either provide your building blocks that you extracted with any of the available tools, or use our integration with our `moffragmentor `_ package. In this case, we will fragment the MOF into its building blocks and then compute the features for each building block and let you choose how you want to aggregate them. @@ -100,15 +113,15 @@ This featurizer will automatically extract the host and guest structures from th .. code-block:: python from matminer.featurizers.structure.sites import SiteStatsFingerprint - + from mofdscribe.featurizers.matmineradapter import MatminerAdapter from mofdscribe.featurizers.hostguest import HostGuestFeaturizer featurizer = HostGuestFeaturizer( - featurizer=SiteStatsFingerprint.from_preset("SOAP_formation_energy"), + featurizer=MatminerAdapter(SiteStatsFingerprint.from_preset("SOAP_formation_energy")), aggregations=("mean",), ) - featurizer.fit([structure]) - features = featurizer.featurize(structure) + featurizer.fit([mof]) + features = featurizer.featurize(mof) If you are interested in surface chemistry features, you might also find suitable featurizers in the `matminer `_ package. diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst index 713aa7da..96c1fa63 100644 --- a/docs/source/contributing.rst +++ b/docs/source/contributing.rst @@ -5,11 +5,12 @@ Extending and contributing to mofdscribe Implementing a new featurizer ----------------------------- -To implement a new featurizer, you typically need to create a new class that inherits from the :py:class:`~mofdscribe.featurizers.base.MOFBaseFeaturizer`. In this class, you need to implement three methods: -:py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize`, :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.feature_labels` and :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.citation`. +To implement a new featurizer, you typically need to create a new class that inherits from the :py:class:`~mofdscribe.featurizers.base.MOFBaseFeaturizer`. In this class, you need to implement four methods: +:py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize`, :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer._featurize`, :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.feature_labels` and :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.citation`. + +The main featurization logic happens in :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer._featurize`. +Your method should accept as input a :py:class:`~pymatgen.core.Structure` (:py:class:`~pymatgen.core.IStructure`, :py:class:`~pymatgen.core.Molecule`, :py:class:`~pymatgen.core.StructureGraph`) object and return a :py:class:`numpy.array`. The :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer._featurize` is supposed to extract the relevant object from a :py:object:`~mofdscribe.mof.MOF` object. -The main featurization logic happens in :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize`. -Your method should accept as input a :py:class:`~pymatgen.core.Structure` object and return a :py:class:`numpy.array`. The :py:meth:`mofdscribe.featurizers.base.MOFBaseFeaturizer.feature_labels` method should return a list of strings that describe the features returned by :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize`. The number of feature names should match the number of features returned by :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.featurize` (i.e. the number of columns in the feature matrix). The :py:meth:`~mofdscribe.featurizers.base.MOFBaseFeaturizer.citation` method should return a list of strings of BibTeX citations for the featurizer. Generally, you also want to decorate your structure with the diff --git a/docs/source/featurizers/atom_centered/racs.rst b/docs/source/featurizers/atom_centered/racs.rst index 905030ad..5d2d5dc5 100644 --- a/docs/source/featurizers/atom_centered/racs.rst +++ b/docs/source/featurizers/atom_centered/racs.rst @@ -2,6 +2,8 @@ Revised autocorrelation functions (RACs) ............................................. +.. _RACS: + Revised autocorrelation functions have originally been proposed for metal-complexes [Janet2017]_. Autocorrelation functions have been widely used as compact, fixed-length descriptors and are defined as diff --git a/docs/source/featurizers/bu_centered/angles.rst b/docs/source/featurizers/bu_centered/angles.rst new file mode 100644 index 00000000..71d23d87 --- /dev/null +++ b/docs/source/featurizers/bu_centered/angles.rst @@ -0,0 +1,24 @@ +Angle-based description of BU shape +======================================= + +The following featurizers compute the angles between all pairs of atoms in a building block. +We always compute the angles A-COM-B, where COM is the center of mass of the building block. + +Given the distribution of the angles, we can compute fixed-length descriptors by either converting +the distribution to a histogram or computing some statistics (mean, standard deviation, etc.) of the distribution. + +.. featurizer:: PairWiseAngleHist + :id: PairWiseAngleHist + :considers_geometry: True + :considers_structure_graph: False + :encodes_chemistry: False + :scope: bu + :scalar: False + +.. featurizer:: PairWiseAngleStats + :id: PairWiseAngleStats + :considers_geometry: True + :considers_structure_graph: False + :encodes_chemistry: False + :scope: bu + :scalar: False diff --git a/docs/source/featurizers/bu_centered/num_hops.rst b/docs/source/featurizers/bu_centered/num_hops.rst new file mode 100644 index 00000000..597afb72 --- /dev/null +++ b/docs/source/featurizers/bu_centered/num_hops.rst @@ -0,0 +1,25 @@ +Shortest-Path Based Description of Building Blocks +====================================================== + +For certain targets, the proximity of connecting groups in a building unit +(e.g. carboxy groups) can be interesting features. + +One way to describe this generally is to compute the distribution +of shortest paths between special sites in the building units that +our ``moffragmentor`` package calls "binding sites" and "branching sites". + +.. featurizer:: BranchingNumHopFeaturizer + :id: BranchingNumHopFeaturizer + :considers_geometry: False + :considers_structure_graph: True + :encodes_chemistry: False + :scope: bu + :scalar: False + +.. featurizer:: BindingNumHopFeaturizer + :id: BindingNumHopFeaturizer + :considers_geometry: False + :considers_structure_graph: True + :encodes_chemistry: False + :scope: bu + :scalar: False diff --git a/docs/source/featurizers/bu_centered/racs.rst b/docs/source/featurizers/bu_centered/racs.rst new file mode 100644 index 00000000..23458eee --- /dev/null +++ b/docs/source/featurizers/bu_centered/racs.rst @@ -0,0 +1,21 @@ + +Revised autocorrelation functions (RACs) +............................................. + +See also :ref:`RACs `. + +This featurizer is a flavor of the :ref:`RACs ` featurizer, that can split the computation over user-defined atom groups and automatically determined communities. For determining communities, we use ``networkx``'s implementation of greedy modularity maximization [NewmanModularity]_. +This community detection often corresponds to chemically meaningful parts (often ring systems) of the structure (but does not require an explicit fragmentation algorithm). + +In mofdscribe, you can customize the encodings :math:`P` (using all properties that are available in our `element-coder `_ package) as well as the aggregation functions. + +.. featurizer:: ModularityCommunityCenteredRACS + :id: ModularityCommunityCenteredRACS + :considers_geometry: False + :considers_structure_graph: True + :encodes_chemistry: optionally + :scope: local + :scalar: False + :style: only-light + + Initially described in [Janet2017]_ for metal complexes, extended to MOFs in [Moosavi2021]_. diff --git a/docs/source/featurizers/global/size.rst b/docs/source/featurizers/global/size.rst new file mode 100644 index 00000000..1625e9cf --- /dev/null +++ b/docs/source/featurizers/global/size.rst @@ -0,0 +1,4 @@ +Extensive size descriptors +================================ + + diff --git a/docs/source/featurizers/graph/dimensionality.rst b/docs/source/featurizers/graph/dimensionality.rst new file mode 100644 index 00000000..ad8330af --- /dev/null +++ b/docs/source/featurizers/graph/dimensionality.rst @@ -0,0 +1,18 @@ +Dimensionality +.................. + +Returns the dimensionality of the structure. This measure is based on [LarsenDimensionality]_, where the structure graph is analyzed. + +In the case of MOFs, rod like structures are considered 1D, sheet like structures are considered 2D, and 3D structures are considered 3D. + +This can be interesting for the metal nodes, where the typical SBUs such as Cu-paddlewheels are 0D. However, many well-known MOFs such as MOF-74 have infinite rod nodes, that this featurizer would consider 1D. + +.. featurizer:: Dimensionality + :id: Dimensionality + :considers_geometry: False + :considers_structure_graph: True + :encodes_chemistry: false + :scope: global + :scalar: True + + Returns the dimensionality of the structure. This measure is based on [LarsenDimensionality]_, where the structure graph is analyzed. \ No newline at end of file diff --git a/docs/source/featurizers/host_guest/host_guest_aprdf.rst b/docs/source/featurizers/host_guest/host_guest_aprdf.rst index 2fba836e..89867792 100644 --- a/docs/source/featurizers/host_guest/host_guest_aprdf.rst +++ b/docs/source/featurizers/host_guest/host_guest_aprdf.rst @@ -5,6 +5,8 @@ This featurizer builds on the :ref:`APRDF` featurizer, but instead of using the correlations between all atoms, it only considers the ones between the guest and all host atoms (within some cutoff distance). +The APRDF is defined as: + .. math:: \operatorname{RDF}^{P}(R)=f \sum_{i, j}^{\text {all atom pairs }} P_{i} P_{j} \mathrm{e}^{-B\left(r_{i j}-R\right)^{2}} diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 4c9c5d9c..fb45d341 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -8,23 +8,12 @@ Featurizing a MOF .. code-block:: python from mofdscribe.chemistry.racs import RACS - from pymatgen.core import Structure + from mofdscribe.mof import MOF - s = Structure.from_file() + mof = MOF.from_file() featurizer = RACS() - features = featurizer.featurize(s) + features = featurizer.featurize(mof) -.. admonition:: mofdscribe base classes - :class: hint - - Most featurizers in mofdscribe inherit from :py:class:`~mofdscribe.featurizers.base.MOFBaseFeaturizer`. - This class can also handle the conversion to primitive cells if you pass :code:`primitive=True` to the - constructor. This can be useful to save computational time but also make it possible to, e.g., - use the :code:`sum` aggregation. - - To avoid re-computation of the primitive cell, you should use the :py:class:`~mofdscribe.featurizers.base.MOFMultipleFeaturizer` - for combining multiple featurizers. This will accept a keyword argument :code:`primitive=True` in the constructor - and then compute the primitive cell once and use it for all the featurizers. It is also easy to combine multiple featurizers into a single pipeline: @@ -32,21 +21,21 @@ It is also easy to combine multiple featurizers into a single pipeline: from mofdscribe.chemistry.racs import RACS from mofdscribe.pore.geometric_properties import PoreDiameters - from pymatgen.core import Structure + from mofdscribe.mof import MOF from mofdscribe.featurizers.base import MOFMultipleFeaturizer - s = Structure.from_file() + mof = MOF.from_file() featurizer = MOFMultipleFeaturizer([RACS(), PoreDiameters()]) - features = featurizer.featurize(s) + features = featurizer.featurize(mof) You can, of course, also pass multiple structures to the featurizer (and the featurization is automatically parallelized via matminer): .. code-block:: python - s = Structure.from_file() - s2 = Structure.from_file() - features = featurizer.featurize_many([s, s2]) + mof_1 = MOF.from_file() + mof_2 = MOF.from_file() + features = featurizer.featurize_many([mof_1, mof_2]) And, clearly, you can also use the `mofdscribe` featurizers alongside ones from `matminer`: @@ -55,9 +44,10 @@ And, clearly, you can also use the `mofdscribe` featurizers alongside ones from from matminer.featurizers.structure import LocalStructuralOrderParams from mofdscribe.chemistry.racs import RACS + from mofdscribe.featurizers.matmineradapter import MatminerAdapter - featurizer = MOFMultipleFeaturizer([RACS(), LocalStructuralOrderParams()]) - features = featurizer.featurize_many([s, s2]) + featurizer = MOFMultipleFeaturizer([RACS(), MatminerAdapter(LocalStructuralOrderParams())]) + features = featurizer.featurize_many([mof_2, mof_2]) If you use the :code:`zeo++` or :code:`raspa2` packages, you can customize the temporary @@ -72,6 +62,22 @@ directory. and notebook in the `examples folder `_. +.. admonition:: Saving time using the MOF object + :class: tip + + From our experience, the most time-consuming part of featurization is the + the computation of the structure graph or the fragments. + + Additionally, you often do not know in advance which featurizers you will + use. + + If you want to save in the case you need to compute additional features, + it can be practical to serialize the :py:class:`~mofdscribe.mof.MOF` objects + after the first featurization. + The objects will already contain the structure graph and the fragments (if + they have been computed in the first featurization). + + Using a reference dataset -------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 1a240b14..014a5ebb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -37,6 +37,7 @@ dependencies, the featurizers are currently not integrated in matminer itself. splitters datasets background + metrics leaderboard contributing api diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst new file mode 100644 index 00000000..f27ee506 --- /dev/null +++ b/docs/source/metrics.rst @@ -0,0 +1,14 @@ +Metrics +=================== + +In order to compare our models we need to score them using a metric. +Most commonly used in are scores such as accuracy, precision, recall, or the mean absolute error for regression problem. + +However, these metrics are not always the best choice. +It is well known, for instance, that accuracy is not a good metric for imbalanced datasets. +However, even beyond such considerations it is important to take into consideration for what purpose the model is used. + +For materials discovery, this often implies that a metric that measures how many of the top materials we find is more +important than an averaged, overall score. + +``mofdscribe`` provides some utilities to help with this in the ``mofdscribe.metrics`` subpackage. \ No newline at end of file diff --git a/docs/source/references.rst b/docs/source/references.rst index 2c1201fe..b959f0ff 100644 --- a/docs/source/references.rst +++ b/docs/source/references.rst @@ -112,3 +112,7 @@ References .. [Trappe] `Potoff, J. J.; Siepmann, J. I. Vapor–Liquid Equilibria of Mixtures Containing Alkanes, Carbon Dioxide, and Nitrogen. AIChE Journal 2001, 47 (7), 1676–1682. `_ .. [Varoquaux] `Varoquaux, G. Cross-Validation Failure: Small Sample Sizes Lead to Large Error Bars. NeuroImage 2018, 180, 68–77. `_ + +.. [LarsenDimensionality] `Larsen, P. M.; Pandey, M.; Strange, M.; Jacobsen, K. W. Definition of a Scoring Parameter to Identify Low-Dimensional Materials Components. Phys. Rev. Materials 2019, 3 (3), 034003. `_ + +.. [NewmanModularity] `Newman, M. E. J. Modularity and community structure in networks. Proceedings of the National Academy of Sciences 2006, 103 (23), 8577–8582. `_ \ No newline at end of file diff --git a/examples/build_model_using_mofdscribe.ipynb b/examples/build_model_using_mofdscribe.ipynb index a47f2ce5..94093bd8 100644 --- a/examples/build_model_using_mofdscribe.ipynb +++ b/examples/build_model_using_mofdscribe.ipynb @@ -54,7 +54,8 @@ "from mofdscribe.featurizers.base import MOFMultipleFeaturizer\n", "from mofdscribe.datasets.thermal_stability_dataset import ThermalStabilityDataset\n", "from mofdscribe.datasets.structuredataset import FrameDataset\n", - "from mofdscribe.splitters import HashSplitter" + "from mofdscribe.splitters import HashSplitter\n", + "from mofdscribe.mof import MOF" ] }, { @@ -19021,7 +19022,7 @@ ], "source": [ "feats = featurizer.featurize_many(\n", - " all_structures, ignore_errors=True\n", + " [MOF(s) for s in all_structures], ignore_errors=True\n", ") # we ignore errors here because some structures might not be fragmentable, those will then have NaNs in the feature matrix" ] }, @@ -19633,7 +19634,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:05:16) \n[Clang 12.0.1 ]" }, "orig_nbformat": 4, "vscode": { diff --git a/examples/featurize.ipynb b/examples/featurize.ipynb index 65722823..2f3e3590 100644 --- a/examples/featurize.ipynb +++ b/examples/featurize.ipynb @@ -17,7 +17,8 @@ "\n", "from mofdscribe.datasets import CoREDataset\n", "from mofdscribe.featurizers.chemistry import APRDF, RACS\n", - "from mofdscribe.featurizers.base import MOFMultipleFeaturizer\n" + "from mofdscribe.featurizers.base import MOFMultipleFeaturizer\n", + "from mofdscribe.mof import MOF\n" ] }, { @@ -108,7 +109,7 @@ ], "source": [ "racs_featurizer = RACS()\n", - "racs = racs_featurizer.featurize(next(structures))\n" + "racs = racs_featurizer.featurize(MOF(next(structures)))\n" ] }, { @@ -1977,7 +1978,7 @@ } ], "source": [ - "feats_all = featurizer.featurize_many(list(ds.get_structures(range(3))), ignore_errors=True)\n" + "feats_all = featurizer.featurize_many([MOF(s) for s in list(ds.get_structures(range(3)))], ignore_errors=True)\n" ] }, { diff --git a/setup.cfg b/setup.cfg index ef2f8b0e..62ab6d3a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -177,17 +177,17 @@ strictness = short ######################### [flake8] ignore = - S301 # pickle - S403 # pickle + S301 + S403 S404 S603 - W503 # Line break before binary operator (flake8 is wrong) - E203 # whitespace before ':' - S101 # Complaining about assert statements - D101 # Docstring missing - D102 # Docstring missing - D103 # Docstring missing - D104 # Docstring missing + W503 + E203 + S101 + D101 + D102 + D103 + D104 D400 exclude = .tox, diff --git a/src/mofdscribe/featurizers/__init__.py b/src/mofdscribe/featurizers/__init__.py index 73f995e3..e69de29b 100644 --- a/src/mofdscribe/featurizers/__init__.py +++ b/src/mofdscribe/featurizers/__init__.py @@ -1,33 +0,0 @@ -from .bu import LSOP # noqa: F401 -from .bu import PMI1 # noqa: F401 -from .bu import PMI2 # noqa: F401 -from .bu import PMI3 # noqa: F401 -from .bu import Asphericity # noqa: F401 -from .bu import BUFeaturizer # noqa: F401 -from .bu import BUMatch # noqa: F401 -from .bu import CompositionStats # noqa: F401 -from .bu import DiskLikeness # noqa: F401 -from .bu import Eccentricity # noqa: F401 -from .bu import InertialShapeFactor # noqa: F401 -from .bu import MOFBBs # noqa: F401 -from .bu import NConf20 # noqa: F401 -from .bu import PairwiseDistanceHist # noqa: F401 -from .bu import RadiusOfGyration # noqa: F401 -from .bu import RDKitAdaptor # noqa: F401 -from .bu import RodLikeness # noqa: F401 -from .bu import SphereLikeness # noqa: F401 -from .bu import SpherocityIndex # noqa: F401 -from .chemistry import AMD # noqa: F401 -from .chemistry import APRDF # noqa: F401 -from .chemistry import RACS # noqa: F401 -from .chemistry import EnergyGridHistogram # noqa: F401 -from .chemistry import Henry # noqa: F401 -from .chemistry import PartialChargeHistogram # noqa: F401 -from .chemistry import PartialChargeStats # noqa: F401 -from .chemistry import PriceLowerBound # noqa: F401 -from .pore import AccessibleVolume # noqa: F401 -from .pore import PoreDiameters # noqa: F401 -from .pore import PoreSizeDistribution # noqa: F401 -from .pore import RayTracingHistogram # noqa: F401 -from .pore import SurfaceArea # noqa: F401 -from .topology import AtomCenteredPH, PHHist, PHImage, PHStats, PHVect # noqa: F401 diff --git a/src/mofdscribe/featurizers/base.py b/src/mofdscribe/featurizers/base.py index 1436076d..761058cc 100644 --- a/src/mofdscribe/featurizers/base.py +++ b/src/mofdscribe/featurizers/base.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- """Base featurizer for MOF structure based featurizers. The main purpose of these classes is currently that they handle @@ -11,34 +12,14 @@ or and iterable of pymatgen structure objects. """ from abc import abstractmethod -from functools import partial -from multiprocessing import Pool -from typing import Union +from typing import Collection, Union import numpy as np -import pandas as pd -from loguru import logger from matminer.featurizers.base import BaseFeaturizer, MultipleFeaturizer +from pymatgen.analysis.graphs import StructureGraph from pymatgen.core import IMolecule, IStructure, Molecule, Structure -from tqdm.auto import tqdm - -def get_primitive(structure: Union[IStructure, Structure, Molecule, IMolecule]) -> Structure: - """Get the primitive cell of a structure. - - We use this wrapper because we want to have a passtrough - for molecules, which some of our featurizers also accept. - - Args: - structure: Structure object. - - Returns: - Structure object. - """ - if isinstance(structure, (Structure, IStructure)): - return structure.get_primitive_structure() - else: - return structure +from mofdscribe.mof import MOF class MOFBaseFeaturizer(BaseFeaturizer): @@ -48,134 +29,64 @@ class MOFBaseFeaturizer(BaseFeaturizer): If you implement a new :code:`MOFBaseFeaturizer`, you need to implement a :code:`_featurize` method. - If you implement :code:`featurize` directly, - you would override the conversion to primitive cell. - - .. warning:: - - If you implement a new :code:`MOFBaseFeaturizer`, - and your featurizer needs to be fitted, keep in mind - to also call the :code:`_get_primitive` method. """ - def __init__(self, primitive: bool = False) -> None: - """ - Construct a MOFBaseFeaturizer. - - Args: - primitive (bool): If True, use the primitive cell of the structure. - Defaults to False. - """ - self.primitive = primitive + @abstractmethod + def _featurize( + self, structure_object: Union[Structure, IStructure, Molecule, IMolecule, StructureGraph] + ) -> np.ndarray: + raise NotImplementedError("_featurize must be implemented in a subclass") - def _get_primitive_many(self, structures): - if self.n_jobs == 1: - return [self._get_primitive(s) for s in structures] - else: - with Pool(self.n_jobs, maxtasksperchild=1) as p: - res = p.map(self._get_primitive, structures, chunksize=self.chunksize) - return res + @abstractmethod + def featurize(self, mof: MOF) -> np.ndarray: + raise NotImplementedError("featurize must be implemented in a subclass") - def _get_primitive(self, structure: Union[Structure, IStructure]) -> Structure: - logger.debug("Getting primitive cell for structure in MOFBaseFeaturizer") - return get_primitive(structure) + # ToDo: make make them abstractmethods in a "FittabelMOFBaseFeaturizer" class + def _fit( + self, structure_object: Union[Structure, IStructure, Molecule, IMolecule, StructureGraph] + ): + raise NotImplementedError("_fit is not implemented for MOFBaseFeaturizer") - @abstractmethod - def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: - raise NotImplementedError("_featurize must be implemented in a subclass") + def fit(self, mofs: Union[MOF, Collection[MOF]]): + raise NotImplementedError("fit is not implemented for MOFBaseFeaturizer") - def fit(self, structures): - if not isinstance(structures, (list, tuple)): - structures = [structures] - if self.primitive: - structures = self._get_primitive_many(structures) - self._fit(structures) - def featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: - """Compute the descriptor for a given structure. +class MOFBaseSiteFeaturizer(BaseFeaturizer): + """Base featurizer for MOF site based featurizers. - Args: - structure (Union[Structure, IStructure]): Structure to compute the descriptor for. + .. note:: - Returns: - A numpy array containing the descriptor. - """ - logger.debug("Featurizing structure in MOFBaseFeaturizer") - if self.primitive: - logger.debug("Getting primitive cell for structure in MOFBaseFeaturizer") - structure = get_primitive(structure) - return self._featurize(structure) + If you implement a new :code:`MOFBaseSiteFeaturizer`, + you need to implement a :code:`_featurize` method. + """ - def featurize_many(self, entries, ignore_errors=False, return_errors=False, pbar=True): - """Featurize a list of entries. + @abstractmethod + def _featurize( + self, + structure_object: Union[Structure, IStructure, Molecule, IMolecule, StructureGraph], + idx: int, + ) -> np.ndarray: + raise NotImplementedError("_featurize must be implemented in a subclass") - If `featurize` takes multiple inputs, supply inputs as a list of tuples. + @abstractmethod + def featurize(self, mof: MOF, idx: int) -> np.ndarray: + raise NotImplementedError("featurize must be implemented in a subclass") - Featurize_many supports entries as a list, tuple, numpy array, - Pandas Series, or Pandas DataFrame. + # ToDo: make make them abstractmethods in a "FittabelMOFBaseFeaturizer" class + def _fit( + self, + structure_object: Union[Structure, IStructure, Molecule, IMolecule, StructureGraph], + sites: Union[int, Collection[int]], + ): + raise NotImplementedError("_fit is not implemented for MOFBaseSiteFeaturizer") - Args: - entries (list-like object): A list of entries to be featurized. - ignore_errors (bool): Returns NaN for entries where exceptions are - thrown if True. If False, exceptions are thrown as normal. - return_errors (bool): If True, returns the feature list as - determined by ignore_errors with traceback strings added - as an extra 'feature'. Entries which featurize without - exceptions have this extra feature set to NaN. - pbar (bool): Show a progress bar for featurization if True. - - Returns: - (list) features for each entry. - - Raises: - Exception: If entries is not a list-like object. - ValueError: If return_errors is set and ignore_errors is True. - """ - if return_errors and not ignore_errors: - raise ValueError("Please set ignore_errors to True to use" " return_errors.") - - # Check inputs - if not isinstance(entries, (tuple, list, np.ndarray, pd.Series, pd.DataFrame)): - raise Exception("'entries' must be a list-like object") - - # Special case: Empty list - if len(entries) == 0: - return [] - - # If the featurize function only has a single arg, zip the inputs - if isinstance(entries, pd.DataFrame): - entries = entries.values - elif isinstance(entries, pd.Series) or not isinstance( - entries[0], (tuple, list, np.ndarray) - ): - entries = zip(entries) - - # Add a progress bar - if pbar: - # list() required, tqdm has issues with memory if generator given - entries = tqdm(list(entries), desc=self.__class__.__name__) - - # Run the actual featurization - if self.n_jobs == 1: - return np.array( - [ - self.featurize_wrapper( - x, ignore_errors=ignore_errors, return_errors=return_errors - ) - for x in entries - ] - ) - else: - with Pool(self.n_jobs, maxtasksperchild=1) as p: - func = partial( - self.featurize_wrapper, - return_errors=return_errors, - ignore_errors=ignore_errors, - ) - res = p.map(func, entries, chunksize=self.chunksize) - return np.array(res) + def fit(self, mofs: Union[MOF, Collection[MOF]], sites: Union[int, Collection[int]]): + raise NotImplementedError("fit is not implemented for MOFBaseSiteFeaturizer") +# Probably, collect the featurizers into different groups depending on the type of input, +# i.e. graphs, molecules, structures, etc. and then have a multiple featurizer for each group. +# and this one just calls the appropriate multiple featurizer depending on the type of input. class MOFMultipleFeaturizer(MultipleFeaturizer): """Base multiplefeaturizer for MOF structure based featurizers. @@ -191,81 +102,37 @@ class MOFMultipleFeaturizer(MultipleFeaturizer): In this case, you're better off using two separate MultipleFeaturizers. """ - def __init__(self, featurizers, iterate_over_entries=True, primitive=True): + def __init__(self, featurizers): """ Construct a MOFMultipleFeaturizer. Args: featurizers (list): List of featurizers to use. - iterate_over_entries (bool): If True, featurize each entry in the list. - If False, featurize each structure in the list. Defaults to True. - primitive (bool): If True, use the primitive cell of the structure. - Defaults to True. """ - # unset the primitive on the individual featurizers - for featurizer in featurizers: - featurizer.primitive = False - + self.iterate_over_entries = True self.featurizers = featurizers - self.iterate_over_entries = iterate_over_entries - self.primitive_multiple = primitive - - def _get_primitive(self, structure: Union[Structure, IStructure]) -> Structure: - return get_primitive(structure) - - def featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: - logger.debug(f"Featurizing structure in MOFMultipleFeaturizer, {self.primitive_multiple}") - if self.primitive_multiple: - logger.debug("Getting primitive cell") - structure = self._get_primitive(structure) - return np.array([feature for f in self.featurizers for feature in f.featurize(structure)]) - - def _get_primitive_many(self, structures): - if self.n_jobs == 1: - return [self._get_primitive(s) for s in structures] - else: - with Pool(self.n_jobs, maxtasksperchild=1) as p: - res = p.map(self._get_primitive, structures, chunksize=self.chunksize) - return res + + def featurize(self, mof: MOF) -> np.ndarray: + return np.array([feature for f in self.featurizers for feature in f.featurize(mof)]) def featurize_many(self, entries, ignore_errors=False, return_errors=False, pbar=True): - logger.debug("Precomputing primitive cells") - if self.primitive_multiple: - entries = self._get_primitive_many(entries) - if self.iterate_over_entries: - return np.array( - super().featurize_many( - entries, - ignore_errors=ignore_errors, - return_errors=return_errors, - pbar=pbar, - ) + return np.array( + super().featurize_many( + entries, + ignore_errors=ignore_errors, + return_errors=return_errors, + pbar=pbar, ) - else: - features = [ - f.featurize_many( - entries, - ignore_errors=ignore_errors, - return_errors=return_errors, - pbar=pbar, - ) - for f in self.featurizers - ] - - return np.hstack(features) + ) def featurize_wrapper(self, x, return_errors=False, ignore_errors=False): - if self.iterate_over_entries: - return np.array( - [ - feature - for f in self.featurizers - for feature in f.featurize_wrapper( - x, return_errors=return_errors, ignore_errors=ignore_errors - ) - ] - ) - else: - return super().featurize_wrapper( - x, return_errors=return_errors, ignore_errors=ignore_errors - ) + + return np.array( + [ + feature + for f in self.featurizers + for feature in f.featurize_wrapper( + x, return_errors=return_errors, ignore_errors=ignore_errors + ) + ] + ) diff --git a/src/mofdscribe/featurizers/bu/__init__.py b/src/mofdscribe/featurizers/bu/__init__.py index b1658dfd..fc999d9e 100644 --- a/src/mofdscribe/featurizers/bu/__init__.py +++ b/src/mofdscribe/featurizers/bu/__init__.py @@ -1,6 +1,15 @@ # -*- coding: utf-8 -*- """Featurizers operating on the secondary building units.""" -from .bu_featurizer import BUFeaturizer, MOFBBs # noqa: F401 +from .angle_hist_featurizer import PairWiseAngleHist # noqa: F401 +from .angle_stats_featurizer import PairWiseAngleStats # noqa: F401 +from .bu_featurizer import ( # noqa: F401 + BindingNumHopFeaturizer, + BindingSitesFeaturizer, + BranchingNumHopFeaturizer, + BranchingSitesFeaturizer, + BUFeaturizer, + MOFBBs, +) from .bu_matches import BUMatch # noqa: F401 from .compositionstats_featurizer import CompositionStats # noqa: F401 from .distance_hist_featurizer import PairwiseDistanceHist # noqa: F401 @@ -21,3 +30,4 @@ SphereLikeness, SpherocityIndex, ) +from .smarts_matches import AcidGroupCounter, BaseGroupCounter, SmartsMatchCounter # noqa: F401 diff --git a/src/mofdscribe/featurizers/bu/angle_hist_featurizer.py b/src/mofdscribe/featurizers/bu/angle_hist_featurizer.py new file mode 100644 index 00000000..e51ea5c4 --- /dev/null +++ b/src/mofdscribe/featurizers/bu/angle_hist_featurizer.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- +"""Describe molecules by computing histogram of angles A-COM-B between atoms A, B and COM. + +COM is the center of mass of the molecule. +""" +from typing import List, Union + +import numpy as np +from matminer.featurizers.base import BaseFeaturizer +from pymatgen.core import IMolecule, Molecule +from pymatgen.util.coord import get_angle + +from mofdscribe.featurizers.utils.extend import operates_on_imolecule, operates_on_molecule +from mofdscribe.featurizers.utils.histogram import get_rdf +from mofdscribe.featurizers.utils.mixins import GetGridMixin + + +@operates_on_molecule +@operates_on_imolecule +class PairWiseAngleHist(BaseFeaturizer, GetGridMixin): + _NAME = "PairWiseAngleHist" + + def __init__( + self, + lower_bound: float = 0.0, + upper_bound: float = 180.0, + bin_size: float = 20, + density: bool = True, + ) -> None: + """Create a new PairwiseDistanceHist featurizer. + + Args: + lower_bound (float): Lower bound of the histogram. + Defaults to 0.0. + upper_bound (float): Upper bound of the histogram. + Defaults to 180. + bin_size (float): Size of the bins. + Defaults to 20. + density (bool): Whether to return the density or the counts. + Defaults to True. + """ + self.lower_bound = lower_bound + self.upper_bound = upper_bound + self.bin_size = bin_size + self.density = density + + def feature_labels(self) -> List[str]: + return [ + f"{self._NAME}_{a}" + for a in self._get_grid(self.lower_bound, self.upper_bound, self.bin_size) + ] + + def _featurize(self, molecule: Union[Molecule, IMolecule]) -> np.ndarray: + # Get the pairwise distances + com = molecule.center_of_mass + angles = [] + for i in range(len(molecule)): + for j in range(len(molecule)): + if i > j: + v1 = molecule.cart_coords[i] - com + v2 = molecule.cart_coords[j] - com + angles.append(get_angle(v1, v2)) + + # Compute the histogram + features = get_rdf( + angles, + lower_lim=self.lower_bound, + upper_lim=self.upper_bound, + bin_size=self.bin_size, + density=self.density, + normalized=False, + ) + return features + + def featurize(self, molecule: Union[Molecule, IMolecule]) -> np.ndarray: + return self._featurize(molecule) + + def citations(self) -> List[str]: + return [] + + def implementors(self) -> List[str]: + return ["Kevin Maik Jablonka"] diff --git a/src/mofdscribe/featurizers/bu/angle_stats_featurizer.py b/src/mofdscribe/featurizers/bu/angle_stats_featurizer.py new file mode 100644 index 00000000..8469b012 --- /dev/null +++ b/src/mofdscribe/featurizers/bu/angle_stats_featurizer.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- +"""Describe molecules by computing statistics for the distribution of angles A-COM-B between atoms A, B and COM. + +COM is the center of mass of the molecule. +""" + +from typing import List, Tuple, Union + +import numpy as np +from matminer.featurizers.base import BaseFeaturizer +from pymatgen.core import IMolecule, Molecule +from pymatgen.util.coord import get_angle + +from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS +from mofdscribe.featurizers.utils.extend import operates_on_imolecule, operates_on_molecule + + +@operates_on_molecule +@operates_on_imolecule +class PairWiseAngleStats(BaseFeaturizer): + _NAME = "PairWiseAngleStats" + + def __init__(self, aggreations: Tuple[str] = ("mean", "std", "min", "max")) -> None: + """Create a new PairwiseDistanceStats featurizer. + + Args: + aggreations (Tuple[str]): Aggreations to compute. + Defaults to ("mean", "std", "min", "max"). + """ + self.aggreations = aggreations + + def feature_labels(self) -> List[str]: + return [f"{self._NAME}_{a}" for a in self.aggreations] + + def _featurize(self, molecule: Union[Molecule, IMolecule]) -> np.ndarray: + # Get the pairwise distances + com = molecule.center_of_mass + angles = [] + for i in range(len(molecule)): + for j in range(len(molecule)): + if i > j: + v1 = molecule.cart_coords[i] - com + v2 = molecule.cart_coords[j] - com + angles.append(get_angle(v1, v2)) + + feats = [ARRAY_AGGREGATORS[a](angles) for a in self.aggreations] + + return np.array(feats) + + def featurize(self, molecule: Union[Molecule, IMolecule]) -> np.ndarray: + return self._featurize(molecule) + + def citations(self) -> List[str]: + return [] + + def implementors(self) -> List[str]: + return ["Kevin Maik Jablonka"] diff --git a/src/mofdscribe/featurizers/bu/bu_featurizer.py b/src/mofdscribe/featurizers/bu/bu_featurizer.py index 9e31461e..5090538f 100644 --- a/src/mofdscribe/featurizers/bu/bu_featurizer.py +++ b/src/mofdscribe/featurizers/bu/bu_featurizer.py @@ -1,17 +1,34 @@ # -*- coding: utf-8 -*- """Compute features on the BUs and then aggregate them.""" -from typing import Collection, List, Optional, Tuple, Union +from abc import abstractmethod +from typing import Callable, Collection, List, Optional, Tuple, Union import numpy as np -from matminer.featurizers.base import BaseFeaturizer +from matminer.featurizers.base import MultipleFeaturizer from pydantic import BaseModel +from pymatgen.analysis.graphs import MoleculeGraph, StructureGraph from pymatgen.core import IMolecule, IStructure, Molecule, Structure +from mofdscribe.featurizers.base import MOFBaseFeaturizer, MOFMultipleFeaturizer from mofdscribe.featurizers.bu.utils import boxed_molecule +from mofdscribe.featurizers.graph.numhops import NumHops from mofdscribe.featurizers.utils import nan_array, set_operates_on from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS +from mofdscribe.mof import MOF +if False: + from moffragmentor import SBU # for type hints +__all__ = ( + "BUFeaturizer", + "BranchingSitesFeaturizer", + "BindingSitesFeaturizer", + "BranchingNumHopFeaturizer", + "BindingNumHopFeaturizer", +) + +# since fragmentation is not super straightforward, +# we give the option to provide the fragments directly class MOFBBs(BaseModel): """Container for MOF building blocks.""" @@ -19,8 +36,19 @@ class MOFBBs(BaseModel): linkers: Optional[List[Union[Structure, Molecule, IStructure, IMolecule]]] -# ToDo: Support `MultipleFeaturizer`s (should be ok, if we recursively call the operates_on method). -class BUFeaturizer(BaseFeaturizer): +def _structuregraph_from_indices(mof, indices) -> StructureGraph: + from moffragmentor.utils import remove_all_nodes_not_in_indices + + structure_graph = mof.structure_graph.__copy__() + remove_all_nodes_not_in_indices(structure_graph, indices) + return structure_graph + + +def structuregraph_from_bu(mof: "MOF", bu: "SBU") -> StructureGraph: + return _structuregraph_from_indices(mof, bu._original_indices) + + +class BUFeaturizer(MOFBaseFeaturizer): """ Compute features on the BUs and then aggregate them. @@ -47,17 +75,20 @@ class BUFeaturizer(BaseFeaturizer): mofbbs=MOFBBs(nodes=[BabelMolAdaptor.from_string( "[CH-]1C=CC=C1.[CH-]1C=CC=C1.[Fe+2]", "smi").pymatgen_mol], linkers=[BabelMolAdaptor.from_string("CCCC", "smi").pymatgen_mol])) - """ + _NAME = "BUFeaturizer" + def __init__( - self, featurizer: BaseFeaturizer, aggregations: Tuple[str] = ("mean", "std", "min", "max") + self, + featurizer: MOFBaseFeaturizer, + aggregations: Tuple[str] = ("mean", "std", "min", "max"), ) -> None: """ Construct a new BUFeaturizer. Args: - featurizer (BaseFeaturizer): The featurizer to use. + featurizer (MOFBaseFeaturizer): The featurizer to use. Currently, we do not support `MultipleFeaturizer`s. Please, instead, use multiple BUFeaturizers. If you use a featurizer that is not implemented in mofdscribe @@ -66,7 +97,12 @@ def __init__( If you do not do this, we default to assuming that it operates on structures. aggregations (Tuple[str]): The aggregations to use. Must be one of :py:obj:`ARRAY_AGGREGATORS`. + + Raises: + ValueError: If the featurizer is a `MultipleFeaturizer`. """ + if isinstance(featurizer, (MOFMultipleFeaturizer, MultipleFeaturizer)): + raise ValueError("BUFeaturizer does not support MultipleFeaturizer.") self._featurizer = featurizer self._aggregations = aggregations set_operates_on(self, featurizer) @@ -77,33 +113,34 @@ def feature_labels(self) -> List[str]: for bb in ["node", "linker"]: for aggregation in self._aggregations: for label in base_labels: - labels.append(f"{bb}_{aggregation}_{label}") + labels.append(f"{self._NAME}_{bb}_{aggregation}_{label}") return labels def _extract_bbs( self, - structure: Optional[Union[Structure, IStructure]] = None, + mof: Optional[MOF], mofbbs: Optional[MOFBBs] = None, ): - if structure is None and mofbbs is None: + if mof is None and mofbbs is None: raise ValueError("You must provide a structure or mofbbs.") - if structure is not None: - from moffragmentor import MOF - - mof = MOF.from_structure(structure) - fragments = mof.fragment() - - if self._operates_on in ("both", "molecule"): + if mof is not None: + fragments = mof.fragments + if self._operates_on & set([Molecule]): linkers = [linker.molecule for linker in fragments.linkers] nodes = [node.molecule for node in fragments.nodes] + elif self._operates_on & set([StructureGraph]): + linkers = [structuregraph_from_bu(mof, linker) for linker in fragments.linkers] + nodes = [structuregraph_from_bu(mof, node) for node in fragments.nodes] + elif self._operates_on & set([MoleculeGraph]): + linkers = [linker.molecule_graph for linker in fragments.linkers] + nodes = [node.molecule_graph for node in fragments.nodes] else: # create a boxed structure - linkers = [boxed_molecule(linker.molecule) for linker in fragments.linkers] - nodes = [boxed_molecule(node.molecule) for node in fragments.nodes] + linkers = [linker._get_boxed_structure() for linker in fragments.linkers] + nodes = [node._get_boxed_structure() for node in fragments.nodes] if mofbbs is not None: - linkers = list(mofbbs.linkers) if mofbbs.linkers is not None else [] nodes = list(mofbbs.nodes) if mofbbs.nodes is not None else [] types = [type(node) for node in nodes] + [type(linker) for linker in linkers] @@ -112,18 +149,22 @@ def _extract_bbs( raise ValueError("All nodes and linkers must be of the same type.") this_type = types[0] - if this_type in (Structure, IStructure) and self._operates_on in ("both", "structure"): + if this_type in (Structure, IStructure) and self._operates_on & set( + [Molecule, Structure] + ): # this is the simple case, we do not need to convert to molecules pass - elif this_type in (Molecule, IMolecule) and self._operates_on in ("both", "molecule"): + elif this_type in (Molecule, IMolecule) and self._operates_on & set( + [Molecule, Structure] + ): # again simple case, we do not need to convert to structures pass - elif this_type in (Molecule, IMolecule) and (self._operates_on == "structure"): + elif this_type in (Molecule, IMolecule) and (self._operates_on & set([Molecule])): # we need to convert to structures nodes = [boxed_molecule(node) for node in nodes] linkers = [boxed_molecule(linker) for linker in linkers] - elif this_type in (Structure, IStructure) and (self._operates_on == "molecule"): + elif this_type in (Structure, IStructure) and (self._operates_on & set([Molecule])): raise ValueError( "You provided structures for a featurizer that operates on molecules. " / "Cannot automatically convert to molecules from structures." @@ -133,22 +174,25 @@ def _extract_bbs( return nodes, linkers + def _fit(self): + raise NotImplementedError("BUFeaturizer does not support _fit.") + def fit( self, - structures: Optional[Collection[Structure]] = None, + mofs: Optional[Collection[MOF]] = None, mofbbs: Optional[Collection[MOFBBs]] = None, ) -> None: """ Fit the featurizer to the given structures. Args: - structures (Collection[Structure], optional): The structures to featurize. + mofs (Collection[MOF], optional): The MOFs to featurize. mofbbs (Collection[MOFBBs], optional): The MOF fragments (nodes and linkers). """ all_nodes, all_linkers = [], [] - if structures is not None: - for structure in structures: - nodes, linkers = self._extract_bbs(structure=structure) + if mofs is not None: + for mof in mofs: + nodes, linkers = self._extract_bbs(mof=mof) all_nodes.extend(nodes) all_linkers.extend(linkers) if mofbbs is not None: @@ -158,16 +202,14 @@ def fit( all_linkers.extend(linkers) all_fragments = all_nodes + all_linkers - self._featurizer.fit(all_fragments) + self._featurizer._fit(all_fragments) + + def _featurize(self): + raise NotImplementedError("BUFeaturizer does not support _featurize.") - # ToDo: - # - Perhaps use type dispatch instead of different keyword arguments. - # (we can use fastcore or code it ourselves) - # - We can also directly pass the graph to the featurizer if it want to work - # on the graph. def featurize( self, - structure: Optional[Union[Structure, IStructure]] = None, + mof: Optional[MOF] = None, mofbbs: Optional[MOFBBs] = None, ) -> np.ndarray: """ @@ -181,7 +223,7 @@ def featurize( where possible. Args: - structure (Union[Structure, IStructure], optional): The structure to featurize. + mof (MOF, optional): The structure to featurize. mofbbs (MOFBBs, optional): The MOF fragments (nodes and linkers). Returns: @@ -189,12 +231,12 @@ def featurize( """ # if i know what the featurizer wants, I can always cast to a structure num_features = len(self._featurizer.feature_labels()) - nodes, linkers = self._extract_bbs(structure, mofbbs) - linker_feats = [self._featurizer.featurize(linker) for linker in linkers] + nodes, linkers = self._extract_bbs(mof, mofbbs) + linker_feats = [self._featurizer._featurize(linker) for linker in linkers] if not linker_feats: linker_feats = [nan_array(num_features)] - node_feats = [self._featurizer.featurize(node) for node in nodes] + node_feats = [self._featurizer._featurize(node) for node in nodes] if not node_feats: node_feats = [nan_array(num_features)] @@ -215,3 +257,307 @@ def citations(self) -> List[str]: def implementors(self) -> List[str]: return self._featurizer.implementors() + + +class _BUSubBaseFeaturizer(BUFeaturizer): + def featurize( + self, + mof: Optional[MOF] = None, + ) -> np.ndarray: + """ + Compute features on the BUs and then aggregate them. + + If you provide a structure, we will fragment the MOF into BUs. + If you already have precomputed fragements or only want to consider a subset + of the BUs, you can provide them manually via the `mofbbs` argument. + + If you manually provide the `mofbbs`, we will convert molecules to structures + where possible. + + Args: + mof (MOF, optional): The structure to featurize. + + Returns: + A numpy array of features. + """ + # if i know what the featurizer wants, I can always cast to a structure + num_features = len(self._featurizer.feature_labels()) + nodes, linkers = self._extract(mof) + linker_feats = [self._featurizer._featurize(linker) for linker in linkers] + if not linker_feats: + linker_feats = [nan_array(num_features)] + + node_feats = [self._featurizer._featurize(node) for node in nodes] + if not node_feats: + node_feats = [nan_array(num_features)] + + aggregated_linker_feats = [] + for aggregation in self._aggregations: + aggregated_linker_feats.extend(ARRAY_AGGREGATORS[aggregation](linker_feats, axis=0)) + aggregated_linker_feats = np.array(aggregated_linker_feats) + + aggregated_node_feats = [] + for aggregation in self._aggregations: + aggregated_node_feats.extend(ARRAY_AGGREGATORS[aggregation](node_feats, axis=0)) + aggregated_node_feats = np.array(aggregated_node_feats) + + return np.concatenate((aggregated_node_feats, aggregated_linker_feats)) + + @abstractmethod + def _extract(self, mof: MOF) -> Tuple[List[Structure], List[Structure]]: + raise NotImplementedError() + + def fit(self, mofs: Collection[MOF]): + all_nodes, all_linkers = [], [] + for mof in mofs: + nodes, linkers = self._extract(mof) + all_nodes.extend(nodes) + all_linkers.extend(linkers) + self._featurizer.fit(all_nodes + all_linkers) + + def citations(self) -> List[str]: + return self._featurizer.citations() + + def implementors(self) -> List[str]: + return self._featurizer.implementors() + + +# ToDo: generalize extract depending on what the featurizer operates on +class BindingSitesFeaturizer(_BUSubBaseFeaturizer): + """A special BU featurizer that operates on structures spanned by "binding sites". + + We define binding sites as the linker atoms that directly connect to + the metal atoms of a node. + A good example are the carboxy oxygen atoms in a copper paddlewheel. + From more details see the `moffragmentor documentation + `_. + + + Example: + >>> from mofdscribe.mof import MOF + >>> from mofdscribe.featurizers.bu.bu_featurizer import BindingSitesFeaturizer + >>> from mofdscribe.featurizers.bu.lsop_featurizer import LSOP + >>> import pandas as pd + >>> mof = MOF.from_file("mof_file.cif") + >>> featurizer = BindingSitesFeaturizer(LSOP()) + >>> feats = featurizer.featurize(mof) + >>> pd.DataFrame([feats], columns=featurizer.feature_labels()) + """ + + _NAME = "BindingSitesFeaturizer" + + def _extract(self, mof: MOF): + if Structure in self._operates_on: + try: + linkers = [ + linker._get_binding_sites_structure() for linker in mof.fragments.linkers + ] + except Exception: + linkers = [] + try: + nodes = [node._get_binding_sites_structure() for node in mof.fragments.nodes] + except Exception: + nodes = [] + return nodes, linkers + elif StructureGraph in self._operates_on: + linkers = [ + _structuregraph_from_indices(mof, linker._original_binding_indices) + for linker in mof.fragments.linkers + ] + nodes = [ + _structuregraph_from_indices(mof, node._original_binding_indices) + for node in mof.fragments.nodes + ] + return nodes, linkers + elif Molecule in self._operates_on: + raise NotImplementedError("BindingSitesFeaturizer does not support Molecule yet.") + else: + raise NotImplementedError( + f"BindingSitesFeaturizer does not support featurizers operating on {self._operates_on}." + ) + + +class BranchingSitesFeaturizer(_BUSubBaseFeaturizer): + """A special BU featurizer that operates on structures spanned by "branching sites". + + Branching sites are defined as the sites where a linker connects to a node. + The most common example is the carbon atom in a carboxy group. + From more details see the `moffragmentor documentation + `_. + + + Example: + >>> from mofdscribe.mof import MOF + >>> from mofdscribe.featurizers.bu.bu_featurizer import BranchingSitesFeaturizer + >>> from mofdscribe.featurizers.bu.lsop_featurizer import LSOP + >>> import pandas as pd + >>> mof = MOF.from_file("mof_file.cif") + >>> featurizer = BranchingSitesFeaturizer(LSOP()) + >>> feats = featurizer.featurize(mof) + >>> pd.DataFrame([feats], columns=featurizer.feature_labels()) + """ + + _NAME = "BranchingSitesFeaturizer" + + def _extract(self, mof: MOF): + if Structure in self._operates_on: + try: + linkers = [ + linker._get_branching_sites_structure() for linker in mof.fragments.linkers + ] + except Exception: + linkers = [] + try: + nodes = [node._get_branching_sites_structure() for node in mof.fragments.nodes] + except Exception: + nodes = [] + return nodes, linkers + elif StructureGraph in self._operates_on: + linkers = [ + _structuregraph_from_indices(mof, linker._original_binding_indices) + for linker in mof.fragments.linkers + ] + nodes = [ + _structuregraph_from_indices(mof, node._original_binding_indices) + for node in mof.fragments.nodes + ] + return nodes, linkers + elif Molecule in self._operates_on: + raise NotImplementedError("BranchingSitesFeaturizer does not support Molecule yet.") + else: + raise NotImplementedError( + f"BranchingSitesFeaturizer does not support featurizers operating on {self._operates_on}." + ) + + +def _extract_branching_indices(bu): + return bu.graph_branching_indices + + +def _extract_binding_indices(bu): + return bu.binding_indices + + +class _NumSiteHops(BUFeaturizer): + _NAME = "NumBranchingSiteHops" + + def __init__(self, hop_stat_aggregations, aggregations, index_extractor: Callable): + self.hop_stat_aggregations = hop_stat_aggregations + self.aggregations = aggregations + self.index_extractor = index_extractor + self._featurizer = NumHops(self.hop_stat_aggregations) + + def _extract(self, mof: MOF): + + linkers = [linker.molecule_graph for linker in mof.fragments.linkers] + nodes = [ + node.molecule_graph for node in mof.fragments.nodes + ] # fixme: should use the dummy graph + + linker_indices = [self.index_extractor(linker) for linker in mof.fragments.linkers] + node_indices = [self.index_extractor(node) for node in mof.fragments.nodes] + return nodes, linkers, node_indices, linker_indices + + def featurize(self, mof: Optional[MOF] = None) -> np.ndarray: + nodes, linkers, node_indices, linker_indices = self._extract(mof) + node_feats = [] + + # np.array( + # [ + # self._featurizer._featurize(node, node_idx) + # for node, node_idx in zip(nodes, node_indices) + # ] + # ) + + for node, node_idx in zip(nodes, node_indices): + try: + node_feats.append(self._featurizer._featurize(node, node_idx)) + except Exception: + node_feats.append([np.nan] * len(self._featurizer.feature_labels())) + + node_feats = np.array(node_feats) + + linker_feats = [] # np.array( + # [ + # self._featurizer._featurize(linker, linker_idx) + # for linker, linker_idx in zip(linkers, linker_indices) + # ] + # ) + + for linker, linker_idx in zip(linkers, linker_indices): + try: + linker_feats.append(self._featurizer._featurize(linker, linker_idx)) + except Exception: + linker_feats.append([np.nan] * len(self._featurizer.feature_labels())) + linker_feats = np.array(linker_feats) + + node_aggregated = np.concatenate( + [ARRAY_AGGREGATORS[agg](node_feats, axis=0) for agg in self.aggregations] + ) + linker_aggregated = np.concatenate( + [ARRAY_AGGREGATORS[agg](linker_feats, axis=0) for agg in self.aggregations] + ) + return np.concatenate([node_aggregated, linker_aggregated]) + + def feature_labels(self) -> List[str]: + base_feature_labels = self._featurizer.feature_labels() + feature_labels = [] + for bb in ["node", "linker"]: + for agg in self.aggregations: + for feat in base_feature_labels: + feature_labels.append(f"{self._NAME}_{bb}_{agg}_{feat}") + return feature_labels + + def implementors(self) -> List[str]: + return ["Kevin Maik Jablonka"] + + def citations(self) -> List[str]: + return [] + + +class BranchingNumHopFeaturizer(_NumSiteHops): + """Compute statistics on the shortest path lengths between branching sites.""" + + _NAME = "BranchingNumHopFeaturizer" + + def __init__( + self, + hop_stat_aggregations: Tuple[str] = ("mean", "std", "min", "max"), + aggregations: Tuple[str] = ("mean", "std", "min", "max"), + ): + """Construct a BranchingNumHopFeaturizer. + + Args: + hop_stat_aggregations (Tuple[str]): Aggregation functions to apply to the + shortest path lengths between branching sites on a building blocks. + Defaults to ("mean", "std", "min", "max"). + aggregations (Tuple[str]): Aggregation functions to apply to the + aggregated statistcs of the shortest path lengths between branching sites + of different building blocks of the same . + Defaults to ("mean", "std", "min", "max"). + """ + super().__init__(hop_stat_aggregations, aggregations, _extract_branching_indices) + + +class BindingNumHopFeaturizer(_NumSiteHops): + """Compute statistics on the shortest path lengths between binding sites.""" + + _NAME = "BindingNumHopFeaturizer" + + def __init__( + self, + hop_stat_aggregations: Tuple[str] = ("mean", "std", "min", "max"), + aggregations: Tuple[str] = ("mean", "std", "min", "max"), + ): + """Construct a BindingNumHopFeaturizer. + + Args: + hop_stat_aggregations (Tuple[str]): Aggregation functions to apply to the + shortest path lengths between branching sites on a building blocks. + Defaults to ("mean", "std", "min", "max"). + aggregations (Tuple[str]): Aggregation functions to apply to the + aggregated statistcs of the shortest path lengths between branching sites + of different building blocks of the same . + Defaults to ("mean", "std", "min", "max"). + """ + super().__init__(hop_stat_aggregations, aggregations, _extract_binding_indices) diff --git a/src/mofdscribe/featurizers/bu/bu_matches.py b/src/mofdscribe/featurizers/bu/bu_matches.py index 4bd3c99c..ec154caa 100644 --- a/src/mofdscribe/featurizers/bu/bu_matches.py +++ b/src/mofdscribe/featurizers/bu/bu_matches.py @@ -10,7 +10,7 @@ from matminer.featurizers.base import BaseFeaturizer from pymatgen.core import IMolecule, IStructure, Molecule, Structure -from ..utils.aggregators import ARRAY_AGGREGATORS +from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS THIS_DIR = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(THIS_DIR, "prototype_env.json"), "r") as handle: @@ -173,6 +173,10 @@ def feature_labels(self) -> List[str]: return self._get_feature_labels() def featurize(self, s: Union[Structure, IStructure, Molecule, IMolecule]) -> np.ndarray: + """Structure is here spanned by the connecting points of a BU.""" + return self._featurize(s) + + def _featurize(self, s: Union[Structure, IStructure, Molecule, IMolecule]) -> np.ndarray: """Structure is here spanned by the connecting points of a BU.""" features = [] for topo in self.topos: diff --git a/src/mofdscribe/featurizers/bu/compositionstats_featurizer.py b/src/mofdscribe/featurizers/bu/compositionstats_featurizer.py index 7ec7b6a9..e6b2749d 100644 --- a/src/mofdscribe/featurizers/bu/compositionstats_featurizer.py +++ b/src/mofdscribe/featurizers/bu/compositionstats_featurizer.py @@ -58,6 +58,9 @@ def feature_labels(self) -> List[str]: return feature_labels def featurize(self, molecule: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray: + return self._featurize(molecule) + + def _featurize(self, molecule: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray: encodings = defaultdict(list) for encoding in self.encodings: for site in molecule.sites: diff --git a/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py b/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py index 17e31dea..ef443fe4 100644 --- a/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py +++ b/src/mofdscribe/featurizers/bu/distance_hist_featurizer.py @@ -13,13 +13,14 @@ operates_on_structure, ) from mofdscribe.featurizers.utils.histogram import get_rdf +from mofdscribe.featurizers.utils.mixins import GetGridMixin @operates_on_molecule @operates_on_imolecule @operates_on_istructure @operates_on_structure -class PairwiseDistanceHist(BaseFeaturizer): +class PairwiseDistanceHist(BaseFeaturizer, GetGridMixin): """ Describe the shape of molecules by computing a histogram of pairwise distances. @@ -32,6 +33,8 @@ class PairwiseDistanceHist(BaseFeaturizer): reported in [Zhang2022]_. """ + _NAME = "PairwiseDistanceHist" + def __init__( self, lower_bound: float = 0.0, @@ -56,13 +59,18 @@ def __init__( self.bin_size = bin_size self.density = density - def _get_grid(self): - return np.arange(self.lower_bound, self.upper_bound, self.bin_size) - def feature_labels(self) -> List[str]: - return [f"pairwise_distance_hist_{a}" for a in self._get_grid()] + return [ + f"{self._NAME}_{a}" + for a in self._get_grid(self.lower_bound, self.upper_bound, self.bin_size) + ] def featurize(self, structure: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray: + return self._featurize(structure) + + def _featurize( + self, structure: Union[Molecule, IMolecule, Structure, IStructure] + ) -> np.ndarray: distances = [] for i, _ in enumerate(structure): for j, _ in enumerate(structure): diff --git a/src/mofdscribe/featurizers/bu/distance_stats_featurizer.py b/src/mofdscribe/featurizers/bu/distance_stats_featurizer.py index d2231460..0816ab88 100644 --- a/src/mofdscribe/featurizers/bu/distance_stats_featurizer.py +++ b/src/mofdscribe/featurizers/bu/distance_stats_featurizer.py @@ -43,6 +43,11 @@ def feature_labels(self) -> List[str]: return [f"pairwise_distance_stats_{a}" for a in self.aggregations] def featurize(self, structure: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray: + return self._featurize(structure) + + def _featurize( + self, structure: Union[Molecule, IMolecule, Structure, IStructure] + ) -> np.ndarray: distances = [] for i, _ in enumerate(structure): for j, _ in enumerate(structure): diff --git a/src/mofdscribe/featurizers/bu/lsop_featurizer.py b/src/mofdscribe/featurizers/bu/lsop_featurizer.py index 63d3a42a..60f4f964 100644 --- a/src/mofdscribe/featurizers/bu/lsop_featurizer.py +++ b/src/mofdscribe/featurizers/bu/lsop_featurizer.py @@ -14,6 +14,7 @@ operates_on_molecule, operates_on_structure, ) +from mofdscribe.mof import MOF _default_types = ( "cn", @@ -88,7 +89,18 @@ def __init__( def feature_labels(self) -> List[str]: return [f"lsop_{val}" for val in self.types] - def featurize(self, s: Union[Structure, IStructure, Molecule, IMolecule]) -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: + """Compute the LSOP for a fragment. + + Args: + mof (MOF): The MOF to compute the LSOP for. + + Returns: + np.ndarray: The LSOP values. + """ + return self._featurize(mof.structure) + + def _featurize(self, s: Union[Structure, IStructure, Molecule, IMolecule]) -> np.ndarray: molecule = Molecule.from_sites(s.sites) com = molecule.center_of_mass orginal_len = len(molecule) diff --git a/src/mofdscribe/featurizers/bu/nconf20_featurizer.py b/src/mofdscribe/featurizers/bu/nconf20_featurizer.py index b5b3a245..de38d89e 100644 --- a/src/mofdscribe/featurizers/bu/nconf20_featurizer.py +++ b/src/mofdscribe/featurizers/bu/nconf20_featurizer.py @@ -7,7 +7,7 @@ from rdkit import Chem from rdkit.Chem import AllChem -from .rdkitadaptor import RDKitAdaptor +from mofdscribe.featurizers.bu.rdkitadaptor import RDKitAdaptor # ToDo: Allow to set some of the options, e.g. for UFF diff --git a/src/mofdscribe/featurizers/bu/racs.py b/src/mofdscribe/featurizers/bu/racs.py new file mode 100644 index 00000000..f4c55b78 --- /dev/null +++ b/src/mofdscribe/featurizers/bu/racs.py @@ -0,0 +1,282 @@ +# -*- coding: utf-8 -*- +"""RACs on molecule and structure graphs with optional community detection.""" +from collections import OrderedDict, defaultdict +from typing import Collection, Dict, List, Optional, Set, Tuple, Union + +import networkx.algorithms.community as nx_comm +import numpy as np +from pymatgen.analysis.graphs import MoleculeGraph, StructureGraph + +from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.chemistry.racs import compute_racs +from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS +from mofdscribe.featurizers.utils.definitions import ( + ALL_ELEMENTS, + ALL_ELEMENTS_EXCEPT_C_H, + ALL_METAL_ELEMENTS, + ALL_NONMETAL_ELEMENTS, +) +from mofdscribe.featurizers.utils.extend import ( + operates_on_moleculegraph, + operates_on_structuregraph, +) +from mofdscribe.featurizers.utils.structure_graph import get_neighbors_up_to_scope +from mofdscribe.mof import MOF + + +def _get_site_iter(structuregraph: Union[StructureGraph, MoleculeGraph]): + if isinstance(structuregraph, StructureGraph): + return structuregraph.structure.sites + elif isinstance(structuregraph, MoleculeGraph): + return structuregraph.molecule.sites + else: + raise ValueError("structuregraph must be either a StructureGraph or a MoleculeGraph") + + +def _get_atom_site_indices( + structuregraph: Union[StructureGraph, MoleculeGraph], + atom_groups: Collection[Tuple[str, List[str], bool]], +): + """Get the indices of the sites that belong to the atom groups.""" + atom_grouped_indices = defaultdict(set) + for atom_group_name, elements, _no_terminal in atom_groups: + for i, site in enumerate(_get_site_iter(structuregraph)): + if site.specie.symbol in elements: + atom_grouped_indices[atom_group_name].add(i) + return atom_grouped_indices + + +def _split_up_communities( + structuregraph: Union[StructureGraph, MoleculeGraph], + communities: List[List[int]], + atom_groups: Collection[Tuple[str, List[str], bool]], +): + """Create dictionary of communities with atom groups as keys.""" + atom_grouped_communities = defaultdict(list) + indices_to_atom_group = _get_atom_site_indices( + structuregraph=structuregraph, atom_groups=atom_groups + ) + + for atom_group_name, _elements, _no_terminal in atom_groups: + for community in communities: + if not isinstance(community, set): + communities_set = set([community]) + else: + communities_set = community + atom_grouped_communities[atom_group_name].append( + indices_to_atom_group[atom_group_name] & set(communities_set) + ) + + return atom_grouped_communities + + +# ToDo: generalize the code and implement in only one place +# the main racs code is quite similar to this one +def _get_racs_for_community( + community_indices, + structure_graph: StructureGraph, + neighbors_at_distance: Dict[int, Dict[int, Set[int]]], + properties: Tuple[str], + scopes: List[int], + property_aggregations: Tuple[str], + corr_aggregations: Tuple[str], + community_aggregations: Tuple[str], + community_name: str = "", +): + community_racs = defaultdict(list) + + if not community_indices: + # one nested list to make it trigger filling it with nan values + community_indices = [[]] + for start_indices in community_indices: + for scope in scopes: + racs = compute_racs( + start_indices, + structure_graph, + neighbors_at_distance, + properties, + scope, + property_aggregations, + corr_aggregations, + community_name, + ) + for k, v in racs.items(): + community_racs[k].append(v) + + aggregated_racs = {} + for racs_name, racs_values in community_racs.items(): + for community_agg in community_aggregations: + agg_func = ARRAY_AGGREGATORS[community_agg] + name = f"{racs_name}__atomgroupagg-{community_agg}" + aggregated_racs[name] = agg_func(racs_values) + + return aggregated_racs + + +@operates_on_structuregraph +@operates_on_moleculegraph +class ModularityCommunityCenteredRACS(MOFBaseFeaturizer): + """RACs on molecule and structure graphs with optional community detection. + + This featurizer is a flavor of the :ref:`RACs ` featurizer. + It can split the computation over user-defined atom groups and automatically determined communities. + For determining communities, we use ``networkx``'s implementation + of greedy modularity maximization [NewmanModularity]_. + + The features are computed for each communinty within each atom group and then aggregated over the communities. + That is, the number of features will depend on the number of atom groups. + The communities will only impact how the features within an atom group are aggregated. + + There are different ways in which you might want to use this featurizer. + + 1) No prior on communitiy structure. In this case, you can set ``dont_use_communities=True``. + It will still compute the RACs over the atom groups you specify but won't have an inner loop over communities + that are then aggegated for the final features for a given atom group. + + 2) No prior on atom grouping. In this case, you can set ``atom_groups=None``. + This will compute the RACs over all atoms. + """ + + _NAME = "ModularityCommunityCenteredRACS" + + def __init__( + self, + atom_groups: Optional[Collection[Tuple[str, Collection[str], bool]]] = None, + attributes: Tuple[Union[int, str]] = ("X", "mod_pettifor", "I", "T"), + scopes: Tuple[int] = (1, 2, 3), + prop_agg: Tuple[str] = ("product", "diff"), + corr_agg: Tuple[str] = ("sum", "avg"), + atom_groups_agg: Tuple[str] = ("avg", "sum"), + dont_use_communities: bool = False, + ): + """Constuct a ModularityCommunityCenteredRACS featurizer. + + Features are computed for each atom group and for each community. The features are then aggregated + over the communities. + + Args: + atom_groups (Optional[Collection[Tuple[str, Collection[str], bool]]], optional): + Elements which form start scopes for RACs. + The first element is the name of the atom group, and will appear in the feature name. + The second element is a list of elements that belong to the atom group. For example, + ('C-H', ['C', 'H'], False) will create a feature for all carbon atoms and hydrogen atoms. + The third element of the tuple is currently not used (but will be used as a flag to + indicate whether to include terminal/leading to terminal atoms in the start scope). + If set to None, there will be one atom group with all atoms. + Defaults to None. + attributes (Tuple[Union[int, str]], optional): + Elemental properties used for the construction of RACs. Defaults to ("X", "mod_pettifor", "I", "T"). + scopes (Tuple[int], optional): Scopes used for the construction of RACs. + Defaults to (1, 2, 3). + prop_agg (Tuple[str], optional): Aggregation methods used for the aggregation of + properties. "Product" corresponds to "product-RACs" and "diff" to "difference-RACs". + Defaults to ("product", "diff"). + corr_agg (Tuple[str], optional): Aggregation methods used for the aggregation of the + correlated properties. Defaults to ("sum", "avg"). + atom_groups_agg (Tuple[str], optional): Aggregation methods used for the pooling + over atom communities within one atom group. Defaults to ("avg", "sum"). + dont_use_communities (bool): If set to true, we do not use modularity-based community detection. + Features are then simply averaged over all atoms. + Defaults to False. + """ + if atom_groups is None: + atom_groups = [("all", ALL_ELEMENTS, False)] + self.atom_groups = atom_groups + self.attributes = attributes + self.scopes = scopes + self.prop_agg = prop_agg + self.corr_agg = corr_agg + self.atom_groups_agg = atom_groups_agg + self.dont_use_communities = dont_use_communities + + @classmethod + def from_preset(cls, preset: str, **kwargs): + if preset.lower() == "cof": + atom_groups = [ + ("all", ALL_ELEMENTS, False), + ("C-H", {"C", "H"}, False), + ("not_C-H", ALL_ELEMENTS_EXCEPT_C_H, False), + ] + elif preset.lower() == "mof": + atom_groups = [ + ("all", ALL_ELEMENTS, False), + ("metal", ALL_METAL_ELEMENTS, False), + ("nonmetal", ALL_NONMETAL_ELEMENTS, False), + ] + return cls(atom_groups=atom_groups, **kwargs) + + def _featurize(self, structuregraph: Union[StructureGraph, MoleculeGraph]): + if not self.dont_use_communities: + communities = list( + nx_comm.greedy_modularity_communities(structuregraph.graph.to_undirected()) + ) + else: + communities = [range(len(structuregraph))] # make one community with all atoms + + neighbors_at_distance = { + i: get_neighbors_up_to_scope(structuregraph, i, max(self.scopes)) + for i in range(len(structuregraph)) + } + + groups = _split_up_communities(structuregraph, communities, self.atom_groups) + + racs = {} + for group_name, group_indices in groups.items(): + racs.update( + _get_racs_for_community( + group_indices, + structuregraph, + neighbors_at_distance, + self.attributes, + self.scopes, + self.prop_agg, + self.corr_agg, + self.atom_groups_agg, + group_name, + ) + ) + + racs_ordered = OrderedDict(sorted(racs.items())) + return np.array(list(racs_ordered.values())) + + def featurize(self, mof: MOF): + return self._featurize(mof.structure_graph) + + def _get_feature_labels(self): + names = [] + for atom_group_name, _, _ in self.atom_groups: + for scope in self.scopes: + for prop in self.attributes: + for property_agg in self.prop_agg: + for cor_agg in self.corr_agg: + for atom_group_agg in self.atom_groups_agg: + names.append( + f"{self._NAME}-{atom_group_name}_prop-{prop}_scope-{scope}_propagg-{property_agg}_corragg-{cor_agg}_atomgroupagg-{atom_group_agg}" # noqa: E501 + ) + + names = sorted(names) + return names + + def feature_labels(self): + return self._get_feature_labels() + + def citations(self) -> List[str]: + return [ + "@article{Janet2017," + "doi = {10.1021/acs.jpca.7b08750}," + "url = {https://doi.org/10.1021/acs.jpca.7b08750}," + "year = {2017}," + "month = nov," + "publisher = {American Chemical Society ({ACS})}," + "volume = {121}," + "number = {46}," + "pages = {8939--8954}," + "author = {Jon Paul Janet and Heather J. Kulik}," + "title = {Resolving Transition Metal Chemical Space: " + "Feature Selection for Machine Learning and Structure{\textendash}Property Relationships}," + "journal = {The Journal of Physical Chemistry A}" + "}", + ] + + def implementors(self): + return ["Kevin Maik Jablonka"] diff --git a/src/mofdscribe/featurizers/bu/rdkitadaptor.py b/src/mofdscribe/featurizers/bu/rdkitadaptor.py index 8370871a..02b23d24 100644 --- a/src/mofdscribe/featurizers/bu/rdkitadaptor.py +++ b/src/mofdscribe/featurizers/bu/rdkitadaptor.py @@ -10,8 +10,8 @@ from pymatgen.core import Molecule from structuregraph_helpers.create import get_local_env_method -from .utils import create_rdkit_mol_from_mol_graph -from ..utils.extend import operates_on_imolecule, operates_on_molecule +from mofdscribe.featurizers.bu.utils import create_rdkit_mol_from_mol_graph +from mofdscribe.featurizers.utils.extend import operates_on_imolecule, operates_on_molecule @operates_on_molecule @@ -70,6 +70,21 @@ def featurize(self, molecule: Union[Molecule, MoleculeGraph]) -> np.ndarray: If the input molecule is a Molecule, we convert it to a MoleculeGraph using the local environment strategy specified in the constructor. + Args: + molecule: A pymatgen Molecule or MoleculeGraph object. + + Returns: + A numpy array of features. + """ + return self._featurize(molecule) + + def _featurize(self, molecule: Union[Molecule, MoleculeGraph]) -> np.ndarray: + """ + Call the RDKit featurizer on the molecule. + + If the input molecule is a Molecule, we convert it to a MoleculeGraph + using the local environment strategy specified in the constructor. + Args: molecule: A pymatgen Molecule or MoleculeGraph object. @@ -82,9 +97,13 @@ def featurize(self, molecule: Union[Molecule, MoleculeGraph]) -> np.ndarray: molecule_graph = MoleculeGraph.with_local_env_strategy( molecule, get_local_env_method(self._local_env_strategy) ) - rdkit_mol = create_rdkit_mol_from_mol_graph( - molecule_graph, force_sanitize=self._force_sanitize - ) + try: + rdkit_mol = create_rdkit_mol_from_mol_graph( + molecule_graph, force_sanitize=self._force_sanitize + ) + except Exception: + logger.warning("Could not create RDKit molecule from pymatgen molecule.") + return np.array([np.nan] * len(self._feature_labels)) feats = self._featurizer(rdkit_mol) if isinstance(feats, (list, tuple, np.ndarray)): return np.asarray(feats) diff --git a/src/mofdscribe/featurizers/bu/shape_featurizer.py b/src/mofdscribe/featurizers/bu/shape_featurizer.py index c1b1002f..fd920efe 100644 --- a/src/mofdscribe/featurizers/bu/shape_featurizer.py +++ b/src/mofdscribe/featurizers/bu/shape_featurizer.py @@ -13,7 +13,7 @@ from rdkit.Chem.Descriptors3D import RadiusOfGyration as RadiusOfGyration_RDKIT from rdkit.Chem.Descriptors3D import SpherocityIndex as SpherocityIndex_RDKIT -from .rdkitadaptor import RDKitAdaptor +from mofdscribe.featurizers.bu.rdkitadaptor import RDKitAdaptor def _rod_likeness(mol): diff --git a/src/mofdscribe/featurizers/bu/smarts_matches.py b/src/mofdscribe/featurizers/bu/smarts_matches.py index ae5b141d..86465d13 100644 --- a/src/mofdscribe/featurizers/bu/smarts_matches.py +++ b/src/mofdscribe/featurizers/bu/smarts_matches.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- """Featurize a molecule using SMARTS matches.""" from functools import partial from typing import Collection, List, Optional diff --git a/src/mofdscribe/featurizers/chemistry/__init__.py b/src/mofdscribe/featurizers/chemistry/__init__.py index ade4df4c..f569d3e3 100644 --- a/src/mofdscribe/featurizers/chemistry/__init__.py +++ b/src/mofdscribe/featurizers/chemistry/__init__.py @@ -4,6 +4,7 @@ from .aprdf import APRDF # noqa: F401 from .energygrid import EnergyGridHistogram # noqa: F401 from .henry import Henry # noqa: F401 +from .numatoms import NumAtoms # noqa: F401 from .partialchargehistogram import PartialChargeHistogram # noqa: F401 from .partialchargestats import PartialChargeStats # noqa: F401 from .price import PriceLowerBound # noqa: F401 diff --git a/src/mofdscribe/featurizers/chemistry/amd.py b/src/mofdscribe/featurizers/chemistry/amd.py index 6daffeda..55068171 100644 --- a/src/mofdscribe/featurizers/chemistry/amd.py +++ b/src/mofdscribe/featurizers/chemistry/amd.py @@ -1,13 +1,14 @@ # -*- coding: utf-8 -*- """Generalized average minimum distance (AMD) featurizer.""" -from typing import List, Tuple, Union +from typing import List, Tuple import numpy as np -from pymatgen.core import IStructure, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure from mofdscribe.featurizers.utils.substructures import filter_element +from mofdscribe.mof import MOF +from mofdscribe.types import StructureIStructureType __all__ = ["AMD"] @@ -43,7 +44,6 @@ def __init__( ), compute_for_all_elements: bool = True, aggregations: Tuple[str] = ("mean",), - primitive: bool = True, ) -> None: """Initialize the AMD descriptor. @@ -60,8 +60,6 @@ def __init__( the original structure with all elements. Defaults to True. aggregations (tuple): Aggregations of the AMD descriptor. The 'mean' is equivalent to the original AMD. Defaults to ('mean',). - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ self.k = k atom_types = [] if atom_types is None else atom_types @@ -71,7 +69,6 @@ def __init__( ) self.compute_for_all_elements = compute_for_all_elements self.aggregations = aggregations - super().__init__(primitive=primitive) def _get_feature_labels(self) -> List[str]: labels = [] @@ -85,21 +82,33 @@ def _get_feature_labels(self) -> List[str]: def feature_labels(self) -> List[str]: return self._get_feature_labels() - def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, structure: StructureIStructureType) -> np.ndarray: """Compute the AMD descriptor for a given structure. Args: - structure (Union[Structure, IStructure]): Structure to compute the descriptor for. + structure (StructureIStructureType): Structure to compute the descriptor for. Returns: A numpy array containing the AMD descriptor. + + Raises: + ImportError: If the AMD package is not installed. """ - from amd._nns import nearest_neighbours - from amd.calculate import _extract_motif_cell - from amd.periodicset import PeriodicSet + try: + from amd._nns import nearest_neighbours + from amd.calculate import _extract_motif_cell + from amd.periodicset import PeriodicSet + except ImportError: + raise ImportError( + "AMD featurizer requires the AMD package.\ + See https://github.com/dwiddo/average-minimum-distance for installation instructions." + ) def get_pdd(structure, k): - motif, cell, asymmetric_unit, multiplicities = _extract_motif_cell( + motif, cell, asymmetric_unit, _multiplicities = _extract_motif_cell( PeriodicSet(structure.cart_coords, structure.lattice.matrix) ) pdd, _, _ = nearest_neighbours(motif, cell, asymmetric_unit, k) diff --git a/src/mofdscribe/featurizers/chemistry/aprdf.py b/src/mofdscribe/featurizers/chemistry/aprdf.py index 7cbe7efc..3376d7be 100644 --- a/src/mofdscribe/featurizers/chemistry/aprdf.py +++ b/src/mofdscribe/featurizers/chemistry/aprdf.py @@ -9,12 +9,12 @@ import numpy as np from element_coder import encode -from pymatgen.core import IStructure, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer - -from ..utils.aggregators import AGGREGATORS -from ..utils.extend import operates_on_istructure, operates_on_structure +from mofdscribe.featurizers.utils.aggregators import AGGREGATORS +from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure +from mofdscribe.mof import MOF +from mofdscribe.types import StructureIStructureType __all__ = ["APRDF"] @@ -50,7 +50,6 @@ def __init__( properties: Tuple[str, int] = ("X", "electron_affinity"), aggregations: Tuple[str] = ("avg", "product", "diff"), normalize: bool = False, - primitive: bool = True, ): """Set up an atomic property (AP) weighted radial distribution function. @@ -73,8 +72,6 @@ def __init__( options. Defaults to ("avg", "product", "diff"). normalize (bool): If True, the histogram is normalized by dividing by the number of atoms. Defaults to False. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ self.lower_lim = lower_lim self.cutoff = cutoff @@ -84,7 +81,6 @@ def __init__( self.b_smear = b_smear self.aggregations = aggregations - super().__init__(primitive=primitive) def precheck(self): pass @@ -106,7 +102,10 @@ def _get_feature_labels(self): return list(aprdfs.flatten()) - def _featurize(self, s: Union[Structure, IStructure]) -> np.array: + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StructureIStructureType) -> np.array: bins = self._bins aprdfs = np.zeros((len(self.properties), len(self.aggregations), len(bins))) diff --git a/src/mofdscribe/featurizers/chemistry/energygrid.py b/src/mofdscribe/featurizers/chemistry/energygrid.py index b9fd4a81..1ccee4d1 100644 --- a/src/mofdscribe/featurizers/chemistry/energygrid.py +++ b/src/mofdscribe/featurizers/chemistry/energygrid.py @@ -9,11 +9,13 @@ from pymatgen.core import IStructure, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure from mofdscribe.featurizers.utils.histogram import get_rdf +from mofdscribe.featurizers.utils.mixins import GetGridMixin from mofdscribe.featurizers.utils.raspa.resize_uc import resize_unit_cell from mofdscribe.featurizers.utils.raspa.run_raspa import detect_raspa_dir, run_raspa - -from ..utils.extend import operates_on_istructure, operates_on_structure +from mofdscribe.mof import MOF +from mofdscribe.types import StructureIStructureType __all__ = ["EnergyGridHistogram"] GRID_INPUT_TEMPLATE = """SimulationType MakeASCIGrid @@ -63,7 +65,7 @@ def read_ascii_grid(filename: Union[str, os.PathLike]) -> pd.DataFrame: @operates_on_istructure @operates_on_structure -class EnergyGridHistogram(MOFBaseFeaturizer): +class EnergyGridHistogram(MOFBaseFeaturizer, GetGridMixin): """Computes the energy grid histograms as originally proposed by Bucior et al. [Bucior2019]_. Conventionally, energy grids can be used to speed up molecular simulations. @@ -87,6 +89,8 @@ class EnergyGridHistogram(MOFBaseFeaturizer): https://doi.org/10.1063/5.0050823. """ + _NAME = "EnergyGridHistogram" + def __init__( self, raspa_dir: Union[str, os.PathLike, None] = None, @@ -104,7 +108,6 @@ def __init__( shifted: bool = False, separate_interactions: bool = True, run_eqeq: bool = True, - primitive: bool = False, ): """Construct the EnergyGridHistogram class. @@ -147,8 +150,6 @@ def __init__( Defaults to True. run_eqeq (bool): If true, runs EqEq to compute charges. Defaults to True. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. Raises: ValueError: If the `raspa_dir` is not a valid directory. @@ -175,7 +176,6 @@ def __init__( self.shifted = shifted self.separate_interactions = separate_interactions self.run_eqeq = run_eqeq - super().__init__(primitive=primitive) def fit_transform(self, structures: List[Union[Structure, IStructure]]): ... @@ -183,18 +183,18 @@ def fit_transform(self, structures: List[Union[Structure, IStructure]]): def fit(self, structure: Union[Structure, IStructure]): ... - def _get_grid(self): - return np.arange(self.min_energy_vdw, self.max_energy_vdw, self.bin_size_vdw) - def feature_labels(self) -> List[str]: - grid = self._get_grid() + grid = self._get_grid(self.min_energy_vdw, self.max_energy_vdw, self.bin_size_vdw) labels = [] for site in self.sites: for grid_point in grid: - labels.append(f"energygridhist_{self.mol_name}_{site}_{grid_point}") + labels.append(f"{self._NAME}_{self.mol_name}_{site}_{grid_point}") return labels - def _featurize(self, s: Union[Structure, IStructure]) -> np.array: + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StructureIStructureType) -> np.array: ff_molecules = {self.mol_name: self.mol_ff} parameters = { diff --git a/src/mofdscribe/featurizers/chemistry/henry.py b/src/mofdscribe/featurizers/chemistry/henry.py index 638fa6f0..fde1bbf2 100644 --- a/src/mofdscribe/featurizers/chemistry/henry.py +++ b/src/mofdscribe/featurizers/chemistry/henry.py @@ -5,13 +5,14 @@ from typing import List, Union import numpy as np -from pymatgen.core import IStructure, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure from mofdscribe.featurizers.utils.raspa.base_parser import parse_base_output from mofdscribe.featurizers.utils.raspa.resize_uc import resize_unit_cell from mofdscribe.featurizers.utils.raspa.run_raspa import detect_raspa_dir, run_raspa +from mofdscribe.mof import MOF +from mofdscribe.types import StructureIStructureType __all__ = ["Henry"] @@ -83,7 +84,6 @@ def __init__( separate_interactions: bool = True, run_eqeq: bool = True, return_std: bool = False, - primitive: bool = False, ): """Initialize the featurizer. @@ -118,8 +118,6 @@ def __init__( Defaults to True. return_std (bool): If true, return the standard deviations. Defaults to False. - primitive (bool): If true, use the primitive unit cell. - Defaults to False. Raises: ValueError: If the `RASPA_DIR` environment variable is not set. @@ -144,9 +142,11 @@ def __init__( self.temperature = temperature self.run_eqeq = run_eqeq self.return_std = return_std - super().__init__(primitive=primitive) - def _featurize(self, s: Union[Structure, IStructure]) -> np.array: + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StructureIStructureType) -> np.array: ff_molecules = {self.mol_name: self.mol_ff} parameters = { diff --git a/src/mofdscribe/featurizers/chemistry/numatoms.py b/src/mofdscribe/featurizers/chemistry/numatoms.py new file mode 100644 index 00000000..fd4632fc --- /dev/null +++ b/src/mofdscribe/featurizers/chemistry/numatoms.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +"""Atom count featurizer.""" +from typing import List, Union + +import numpy as np +from pymatgen.core import IMolecule, IStructure, Molecule, Structure + +from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.utils.extend import ( + operates_on_imolecule, + operates_on_istructure, + operates_on_molecule, + operates_on_structure, +) +from mofdscribe.mof import MOF + + +@operates_on_structure +@operates_on_istructure +@operates_on_molecule +@operates_on_imolecule +class NumAtoms(MOFBaseFeaturizer): + """Featurizer that returns the number of atoms in a structure.""" + + def __init__(self) -> None: + """Construct a new NumAtoms featurizer.""" + super().__init__() + + def _featurize( + self, structure_object: Union[Structure, IStructure, Molecule, IMolecule] + ) -> np.ndarray: + if not isinstance(structure_object, (Structure, IStructure, Molecule, IMolecule)): + raise ValueError("Structure object must be a pymatgen Structure or Molecule.") + return np.array([len(structure_object)]) + + def featurize(self, mof: MOF) -> int: + return len(mof.structure) + + def feature_labels(self) -> List[str]: + return ["num_atoms"] + + def citations(self) -> List[str]: + return [] + + def implementors(self) -> List[str]: + return ["Kevin Maik Jablonka"] diff --git a/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py b/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py index a46000d5..7c5e43bd 100644 --- a/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py +++ b/src/mofdscribe/featurizers/chemistry/partialchargehistogram.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """Partial charge histogram featurizer.""" -from typing import List, Union +from typing import List import numpy as np from pymatgen.core import IStructure, Structure @@ -9,25 +9,29 @@ from mofdscribe.featurizers.utils.eqeq import get_eqeq_charges from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure from mofdscribe.featurizers.utils.histogram import get_rdf +from mofdscribe.featurizers.utils.mixins import GetGridMixin +from mofdscribe.mof import MOF +from mofdscribe.types import StructureIStructureType __all__ = ["PartialChargeHistogram"] @operates_on_istructure @operates_on_structure -class PartialChargeHistogram(MOFBaseFeaturizer): +class PartialChargeHistogram(MOFBaseFeaturizer, GetGridMixin): """Compute partial charges using the EqEq charge equilibration method [Ongari2019]_. Then derive a fix-length feature vector from the partial charges by binning charges in a histogram. """ + _NAME = "PartialChargeHistogram" + def __init__( self, min_charge: float = -4, max_charge: float = 4, bin_size: float = 0.5, - primitive: bool = False, ) -> None: """Construct a new PartialChargeHistogram featurizer. @@ -38,21 +42,21 @@ def __init__( Defaults to 4. bin_size (float): Bin size. Defaults to 0.5. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ self.min_charge = min_charge self.max_charge = max_charge self.bin_size = bin_size - super().__init__(primitive=primitive) - - def _get_grid(self): - return np.arange(self.min_charge, self.max_charge, self.bin_size) def feature_labels(self) -> List[str]: - return [f"chargehist_{val}" for val in self._get_grid()] + return [ + f"{self._NAME}_{val}" + for val in self._get_grid(self.min_charge, self.max_charge, self.bin_size) + ] + + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(s=mof.structure) - def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: + def _featurize(self, s: StructureIStructureType) -> np.ndarray: if isinstance(s, Structure): s = IStructure.from_sites(s.sites) _, results = get_eqeq_charges(s) diff --git a/src/mofdscribe/featurizers/chemistry/partialchargestats.py b/src/mofdscribe/featurizers/chemistry/partialchargestats.py index d9e0d581..fd668e85 100644 --- a/src/mofdscribe/featurizers/chemistry/partialchargestats.py +++ b/src/mofdscribe/featurizers/chemistry/partialchargestats.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """Partial charge statistics featurizer.""" -from typing import List, Tuple, Union +from typing import List, Tuple import numpy as np from pymatgen.core import IStructure, Structure @@ -9,6 +9,8 @@ from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS from mofdscribe.featurizers.utils.eqeq import get_eqeq_charges from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure +from mofdscribe.mof import MOF +from mofdscribe.types import StructureIStructureType __all__ = ["PartialChargeStats"] @@ -26,9 +28,7 @@ class PartialChargeStats(MOFBaseFeaturizer): `_ """ - def __init__( - self, aggregations: Tuple[str] = ("max", "min", "std"), primitive: bool = True - ) -> None: + def __init__(self, aggregations: Tuple[str] = ("max", "min", "std")) -> None: """ Initialize the PartialChargeStats featurizer. @@ -36,16 +36,16 @@ def __init__( aggregations (Tuple[str]): Aggregations to compute. For available methods, see :py:obj:`mofdscribe.featurizers.utils.aggregators.ARRAY_AGGREGATORS`. Defaults to ("max", "min", "std"). - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ self.aggregations = aggregations - super().__init__(primitive=primitive) def feature_labels(self) -> List[str]: return [f"chargestat_{agg}" for agg in self.aggregations] - def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(s=mof.structure) + + def _featurize(self, s: StructureIStructureType) -> np.ndarray: if isinstance(s, Structure): s = IStructure.from_sites(s.sites) _, results = get_eqeq_charges(s) diff --git a/src/mofdscribe/featurizers/chemistry/price.py b/src/mofdscribe/featurizers/chemistry/price.py index 0135217e..a89f5d83 100644 --- a/src/mofdscribe/featurizers/chemistry/price.py +++ b/src/mofdscribe/featurizers/chemistry/price.py @@ -9,6 +9,9 @@ import pandas as pd from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure +from mofdscribe.mof import MOF +from mofdscribe.types import StructureIStructureType _THIS_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -75,6 +78,8 @@ def _price_per_atom(element_price_fractions_kg, structure): ) +@operates_on_istructure +@operates_on_structure class PriceLowerBound(MOFBaseFeaturizer): """Compute a lower bound for the price based on the element prices. @@ -101,23 +106,18 @@ class PriceLowerBound(MOFBaseFeaturizer): "per_atom": _price_per_atom, } - def __init__( - self, projections: Tuple[str] = ("gravimetric", "volumetric"), primitive: bool = False - ): + def __init__(self, projections: Tuple[str] = ("gravimetric", "volumetric")): """Initialize the PriceLowerBound featurizer. Args: projections (Tuple[str]): List of projections to use. Possible values are "gravimetric" and "volumetric". Default is ("gravimetric", "volumetric"). - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to False. """ self.projections = projections self.element_masses = _get_element_abundance() self.projections = projections - super().__init__(primitive=primitive) def feature_labels(self) -> List[str]: labels = [] @@ -125,7 +125,10 @@ def feature_labels(self) -> List[str]: labels.append(f"price_lower_bound_{projection}") return labels - def _featurize(self, structure) -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, structure: StructureIStructureType) -> np.ndarray: element_masses = {} for site in structure.sites: diff --git a/src/mofdscribe/featurizers/chemistry/racs.py b/src/mofdscribe/featurizers/chemistry/racs.py index fe6981f3..a252a554 100644 --- a/src/mofdscribe/featurizers/chemistry/racs.py +++ b/src/mofdscribe/featurizers/chemistry/racs.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """Revised autocorrelation functions (RACs) for MOFs.""" from collections import OrderedDict, defaultdict -from typing import Collection, List, Optional, Tuple, Union +from typing import Collection, Dict, List, Optional, Set, Tuple, Union import numpy as np from element_coder import encode @@ -10,22 +10,28 @@ from pymatgen.core import IStructure, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.chemistry._fragment import get_bb_indices from mofdscribe.featurizers.utils.aggregators import AGGREGATORS, ARRAY_AGGREGATORS -from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure +from mofdscribe.featurizers.utils.extend import ( + operates_on_istructure, + operates_on_structure, + operates_on_structuregraph, +) from mofdscribe.featurizers.utils.structure_graph import ( get_connected_site_indices, - get_neighbors_at_distance, + get_neighbors_up_to_scope, get_structure_graph, ) +from mofdscribe.mof import MOF +from mofdscribe.types import StructureIStructureType -from ._fragment import get_bb_indices - -__all__ = ("RACS",) +__all__ = ("RACS", "compute_racs") -def _compute_racs( +def compute_racs( start_indices: Collection[int], structure_graph: StructureGraph, + neighbors_at_distance: Dict[int, Dict[int, Set[int]]], properties: Tuple[str], scope: int, property_aggregations: Tuple[str], @@ -33,6 +39,22 @@ def _compute_racs( part_name: str = "", nan_value: float = np.nan, ): + """Compute the RACs for a given set of properties and scope. + + Args: + start_indices (Collection[int]): The indices of the sites to start from. + structure_graph (StructureGraph): The structure graph. + neighbors_at_distance (Dict[int, Dict[int, Set[int]]], optional): The neighbors at distance. Defaults to None. + properties (Tuple[str]): The properties that are correlated + scope (int): The scope of the RACs. + property_aggregations (Tuple[str]): The aggregations to perform on the properties. + corr_aggregations (Tuple[str]): The aggregations to perform on the correlations. + part_name (str): The name of the part. Defaults to "". + nan_value (float): The value to use for missing values. Defaults to np.nan. + + Returns: + Dict[str, float]: The RACs. + """ racs = defaultdict(lambda: defaultdict(list)) if len(start_indices) == 0: logger.debug(f"No start indices for {part_name}") @@ -41,7 +63,7 @@ def _compute_racs( racs[prop][agg].append(nan_value) else: for start_atom in start_indices: - _, neighbors = get_neighbors_at_distance(structure_graph, start_atom, scope) + neighbors = neighbors_at_distance[start_atom][scope] num_neighbors = len(neighbors) # We only branch if there are actually neighbors scope many bonds away ... @@ -76,7 +98,7 @@ def _compute_racs( for aggregation_name, aggregation_values in property_values.items(): for corr_agg in corr_aggregations: agg_func = ARRAY_AGGREGATORS[corr_agg] - name = f"racs_bb-{part_name}_prop-{property_name}_scope-{scope}_propagg-{aggregation_name}_corragg-{corr_agg}" # noqa: E501 + name = f"racs-{part_name}_prop-{property_name}_scope-{scope}_propagg-{aggregation_name}_corragg-{corr_agg}" # noqa: E501 aggregated_racs[name] = agg_func(aggregation_values) return aggregated_racs @@ -85,6 +107,7 @@ def _compute_racs( def _get_racs_for_bbs( bb_indices: Collection[int], structure_graph: StructureGraph, + neighbors_at_distance: Dict[int, Dict[int, Set[int]]], properties: Tuple[str], scopes: List[int], property_aggregations: Tuple[str], @@ -99,9 +122,10 @@ def _get_racs_for_bbs( bb_indices = [[]] for start_indices in bb_indices: for scope in scopes: - racs = _compute_racs( + racs = compute_racs( start_indices, structure_graph, + neighbors_at_distance, properties, scope, property_aggregations, @@ -123,6 +147,7 @@ def _get_racs_for_bbs( @operates_on_istructure @operates_on_structure +@operates_on_structuregraph class RACS(MOFBaseFeaturizer): r"""Modified version of the revised autocorrelation functions (RACs) for MOFs. @@ -151,6 +176,8 @@ class RACS(MOFBaseFeaturizer): `_. """ + _NAME = "RACS" + def __init__( self, attributes: Tuple[Union[int, str]] = ("X", "mod_pettifor", "I", "T"), @@ -166,7 +193,6 @@ def __init__( "linker_scaffold", "nodes", ), - primitive: bool = True, ) -> None: """ Initialize the RACS featurizer. @@ -184,8 +210,6 @@ def __init__( bond_heuristic (str): Method used to guess bonds. Defaults to "vesta". bbs (Tuple[str]): Building blocks to use. Defaults to ("linker_all", "linker_connecting", "linker_functional", "linker_scaffold", "nodes"). - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ self.attributes = attributes self.scopes = scopes @@ -203,12 +227,25 @@ def __init__( "linker_scaffold", "nodes", ] - super().__init__(primitive=primitive) - def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: - if isinstance(structure, Structure): + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure_graph) + + def _featurize(self, structure: Union[StructureIStructureType, StructureGraph]) -> np.ndarray: + if isinstance(structure, (Structure, IStructure)): structure = IStructure.from_sites(structure) - sg = get_structure_graph(structure, self.bond_heuristic) + sg = get_structure_graph(structure, self.bond_heuristic) + elif isinstance(structure, StructureGraph): + sg = structure + structure = sg.structure + else: + raise TypeError( + f"Structure must be pymatgen Structure or StructureGraph, found {type(structure)}" + ) + + neighbors_at_distance = { + i: get_neighbors_up_to_scope(sg, i, max(self.scopes)) for i in range(len(structure)) + } racs = {} # This finds all indices for a particular subset of atoms @@ -219,6 +256,7 @@ def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: _get_racs_for_bbs( bb_indices[bb], sg, + neighbors_at_distance, self.attributes, self.scopes, self.prop_agg, @@ -239,7 +277,7 @@ def _get_feature_labels(self) -> List[str]: for cor_agg in self.corr_agg: for bb_agg in self.bb_agg: names.append( - f"racs_bb-{bb}_prop-{prop}_scope-{scope}_propagg-{property_agg}_corragg-{cor_agg}_bbagg-{bb_agg}" # noqa: E501 + f"{self._NAME}-{bb}_prop-{prop}_scope-{scope}_propagg-{property_agg}_corragg-{cor_agg}_bbagg-{bb_agg}" # noqa: E501 ) names = sorted(names) diff --git a/src/mofdscribe/featurizers/graph/__init__.py b/src/mofdscribe/featurizers/graph/__init__.py new file mode 100644 index 00000000..5a4e7539 --- /dev/null +++ b/src/mofdscribe/featurizers/graph/__init__.py @@ -0,0 +1 @@ +from .dimensionality import Dimensionality # noqa: F401 diff --git a/src/mofdscribe/featurizers/graph/dimensionality.py b/src/mofdscribe/featurizers/graph/dimensionality.py new file mode 100644 index 00000000..ff6814e8 --- /dev/null +++ b/src/mofdscribe/featurizers/graph/dimensionality.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +"""Compute the bond-topological dimensionality.""" +from typing import List + +import numpy as np +from pymatgen.analysis.dimensionality import get_dimensionality_larsen +from pymatgen.analysis.graphs import StructureGraph + +from mofdscribe.featurizers.graph.graphfeaturizer import GraphFeaturizer +from mofdscribe.featurizers.utils.extend import operates_on_structuregraph +from mofdscribe.mof import MOF + + +@operates_on_structuregraph +class Dimensionality(GraphFeaturizer): + def __init__(self) -> None: + """Construct a new Dimensionality featurizer.""" + super().__init__() + + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure_graph) + + def _featurize(self, structure_graph: StructureGraph) -> np.ndarray: + return np.array([get_dimensionality_larsen(structure_graph)]) + + def feature_labels(self) -> List[str]: + return ["dimensionality"] + + def citations(self) -> List[str]: + return [ + "@article{Larsen_2019," + "doi = {10.1103/physrevmaterials.3.034003}," + "url = {https://doi.org/10.1103%2Fphysrevmaterials.3.034003}," + "year = 2019," + "month = {mar}," + "publisher = {American Physical Society ({APS})}," + "volume = {3}," + "number = {3}," + "author = {Peter Mahler Larsen and Mohnish Pandey and Mikkel Strange and Karsten Wedel Jacobsen}," + "title = {Definition of a scoring parameter to identify low-dimensional materials components}," + "journal = {Phys. Rev. Materials}" + "}" + ] + + def implementors(self) -> List[str]: + return ["Kevin Maik Jablonka"] diff --git a/src/mofdscribe/featurizers/graph/graphfeaturizer.py b/src/mofdscribe/featurizers/graph/graphfeaturizer.py new file mode 100644 index 00000000..20f3520b --- /dev/null +++ b/src/mofdscribe/featurizers/graph/graphfeaturizer.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +"""Base class for featurizers that operate on graphs.""" +from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.utils.extend import operates_on_structuregraph + +# operates on needs to be extended to graphs + + +# cannot use here the MOFBaseFeaturizer because the input is not always a structure +# not super sure about this yet, but I think, over the long run, I'd like this to only accept graphs +# i'd like to have some kind of transformer objects that handle the conversion from structure to graph +# if needed +# all of this could then be orchestrated in a pipeline object +@operates_on_structuregraph +class GraphFeaturizer(MOFBaseFeaturizer): + """Base class for graph featurizers.""" + + pass diff --git a/src/mofdscribe/featurizers/graph/numhops.py b/src/mofdscribe/featurizers/graph/numhops.py new file mode 100644 index 00000000..7540aec3 --- /dev/null +++ b/src/mofdscribe/featurizers/graph/numhops.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +"""Featurizer that computes statistics on the number of hops between binding/branching sites.""" + +from typing import Collection, Tuple, Union + +import networkx as nx +import numpy as np +from pymatgen.analysis.graphs import MoleculeGraph, StructureGraph + +from mofdscribe.featurizers.graph.graphfeaturizer import GraphFeaturizer +from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS +from mofdscribe.featurizers.utils.extend import ( + operates_on_moleculegraph, + operates_on_structuregraph, +) +from mofdscribe.mof import MOF + + +def _hop_stats( + structuregraph: Union[StructureGraph, MoleculeGraph], indices: Collection[int], aggregations +) -> np.ndarray: + # for all combinations of indices compute the shortest path + # and return the statistics of the shortest paths + shortest_paths = [] + undirected = structuregraph.graph.to_undirected() + for i_count, i in enumerate(indices): + for j_count, j in enumerate(indices): + if i_count > j_count: + shortest_paths.append(nx.shortest_path_length(undirected, i, j)) + return np.array([ARRAY_AGGREGATORS[agg](shortest_paths) for agg in aggregations]) + + +@operates_on_structuregraph +@operates_on_moleculegraph +class NumHops(GraphFeaturizer): + """Featurizer that computes statistics on the shortest path lengths between sites.""" + + _NAME = "num_hops" + + def __init__(self, aggregations: Tuple[str] = ("mean", "std", "min", "max")) -> None: + """Construct a NumHops featurizer. + + Args: + aggregations: The aggregations to compute on the shortest path lengths. + Defaults to ("mean", "std", "min", "max"). + """ + self.aggregations = aggregations + + def _featurize( + self, structuregraph: Union[StructureGraph, MoleculeGraph], indices: Collection[int] + ) -> np.ndarray: + # for all combinations of indices compute the shortest path + # and return the statistics of the shortest paths + return _hop_stats(structuregraph, indices, self.aggregations) + + def featurize(self, mof: MOF, indices: Collection[int]) -> np.ndarray: + return self._featurize(mof.structure_graph, indices) + + def feature_labels(self) -> Collection[str]: + return [f"{self._NAME}_{agg}" for agg in self.aggregations] + + def citations(self) -> Collection[str]: + return [] + + def implementors(self) -> Collection[str]: + return ["Kevin Maik Jablonka"] diff --git a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py index d486f277..b779786b 100644 --- a/src/mofdscribe/featurizers/hostguest/guest_aprdf.py +++ b/src/mofdscribe/featurizers/hostguest/guest_aprdf.py @@ -1,21 +1,21 @@ # -*- coding: utf-8 -*- """Guest-centered atomic-property weighted autocorrelation function.""" from functools import cached_property -from typing import List, Optional, Tuple, Union +from typing import List, Tuple, Union import numpy as np from element_coder import encode -from matminer.featurizers.base import BaseFeaturizer -from pymatgen.core import IStructure, Structure +from pymatgen.core import IStructure, Molecule, Structure -from mofdscribe.featurizers.hostguest.utils import HostGuest, _extract_host_guest - -from ..utils.aggregators import AGGREGATORS +from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.hostguest.utils import _extract_host_guest +from mofdscribe.featurizers.utils.aggregators import AGGREGATORS +from mofdscribe.mof import MOF __all__ = ["GuestCenteredAPRDF"] -class GuestCenteredAPRDF(BaseFeaturizer): +class GuestCenteredAPRDF(MOFBaseFeaturizer): """Guest-centered atomic-property weighted autocorrelation function. This is a modification of the AP-RDF that is centered on the guest atoms. @@ -86,29 +86,27 @@ def _get_feature_labels(self): def _extract_host_guest( self, - structure: Optional[Union[Structure, IStructure]] = None, - host_guest: Optional[HostGuest] = None, + structure: Union[Structure, IStructure], ): return _extract_host_guest( structure=structure, - host_guest=host_guest, remove_guests=True, - operates_on="molecule", + operates_on=set([Molecule]), local_env_method=self._local_env_method, ) - def featurize( + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(structure=mof.structure) + + def _featurize( self, - structure: Optional[Union[Structure, IStructure]], - host_guest: Optional[HostGuest] = None, + structure: Union[Structure, IStructure], ) -> np.ndarray: """ Compute the features of the host and the guests and aggregate them. Args: - structure (Optional[Union[Structure, IStructure]]): The structure to featurize. - host_guest (Optional[HostGuest]): The host_guest to featurize. - If you provide this, you must not provide structure. + structure (Union[Structure, IStructure]): The structure to featurize. Returns: np.ndarray: The features of the host and the guests. @@ -116,7 +114,7 @@ def featurize( Raises: ValueError: If we cannot detect a host. """ - host_guest = self._extract_host_guest(structure=structure, host_guest=host_guest) + host_guest = self._extract_host_guest(structure=structure) if not host_guest.host: raise ValueError( diff --git a/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py b/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py index 05d92ed4..f4c97249 100644 --- a/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py +++ b/src/mofdscribe/featurizers/hostguest/host_guest_featurizer.py @@ -1,20 +1,22 @@ # -*- coding: utf-8 -*- """Compute features on the host and the guests and then aggregate them.""" -from typing import Collection, List, Optional, Tuple, Union +from typing import Collection, List, Tuple, Union import numpy as np from loguru import logger from matminer.featurizers.base import BaseFeaturizer from pymatgen.core import IStructure, Structure -from mofdscribe.featurizers.hostguest.utils import HostGuest, _extract_host_guest +from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.hostguest.utils import _extract_host_guest from mofdscribe.featurizers.utils import set_operates_on from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS +from mofdscribe.mof import MOF # ToDo: How do we handle if there is no guest? # make it optional to remove guests from the structure? -class HostGuestFeaturizer(BaseFeaturizer): +class HostGuestFeaturizer(MOFBaseFeaturizer): """ Compoute features on the host and the guests and then aggregate them. @@ -83,59 +85,52 @@ def feature_labels(self) -> List[str]: labels.append(f"{bb}_{aggregation}_{label}") return labels - def _extract_host_guest( - self, - structure: Optional[Union[Structure, IStructure]] = None, - host_guest: Optional[HostGuest] = None, - ): + def _extract_host_guest(self, structure: Union[Structure, IStructure]): return _extract_host_guest( structure=structure, - host_guest=host_guest, remove_guests=self._remove_guests, operates_on=self._operates_on, local_env_method=self._local_env_method, ) - def fit( + def fit(self, mofs: Collection[MOF]): + """ + Fit the featurizer to the given MOFs. + + Args: + mofs (Collection[MOF]): The MOFs to fit to. + """ + structures = [mof.structure for mof in mofs] + self._fit(structures=structures) + + def _fit( self, structures: Collection[Union[Structure, IStructure]], - host_guests: Optional[Collection[HostGuest]] = None, ) -> None: """ Fit the featurizer to the given structures. Args: structures (Collection[Union[Structure, IStructure]]): The structures to fit to. - host_guests (Optional[Collection[HostGuest]]): The host_guests to fit to. - If you provide this, you must not provide structures. """ all_hosts, all_guests = [], [] - if structures is not None: - for structure in structures: - host_guest = self._extract_host_guest(structure=structure) - all_hosts.append(host_guest.host) - all_guests.extend(host_guest.guests) + for structure in structures: + host_guest = self._extract_host_guest(structure=structure) + all_hosts.append(host_guest.host) + all_guests.extend(host_guest.guests) - if host_guests is not None: - for host_guest in host_guests: - host_guest = self._extract_host_guest(host_guest=host_guest) - all_hosts.append(host_guest.host) - all_guests.extend(host_guest.guests) - self._featurizer.fit(all_hosts + all_guests) + self._featurizer._fit(all_hosts + all_guests) - def featurize( - self, - structure: Optional[Union[Structure, IStructure]], - host_guest: Optional[HostGuest] = None, - ) -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(structure=mof.structure) + + def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray: """ Compute the features of the host and the guests and aggregate them. Args: - structure (Optional[Union[Structure, IStructure]]): The structure to featurize. - host_guest (Optional[HostGuest]): The host_guest to featurize. - If you provide this, you must not provide structure. + structure (Union[Structure, IStructure]): The structure to featurize. Returns: np.ndarray: The features of the host and the guests. @@ -143,7 +138,7 @@ def featurize( Raises: ValueError: If we cannot detect a host. """ - host_guest = self._extract_host_guest(structure=structure, host_guest=host_guest) + host_guest = self._extract_host_guest(structure=structure) if host_guest.host is None: raise ValueError( @@ -156,10 +151,10 @@ def featurize( ) guest_feats = [] - host_feats = self._featurizer.featurize(host_guest.host) + host_feats = self._featurizer._featurize(host_guest.host) for guest in host_guest.guests: - guest_feats.append(self._featurizer.featurize(guest)) + guest_feats.append(self._featurizer._featurize(guest)) aggregated_feats = [] for aggregation in self._aggregations: diff --git a/src/mofdscribe/featurizers/hostguest/utils.py b/src/mofdscribe/featurizers/hostguest/utils.py index 9c4cb7df..28e72e8a 100644 --- a/src/mofdscribe/featurizers/hostguest/utils.py +++ b/src/mofdscribe/featurizers/hostguest/utils.py @@ -38,34 +38,27 @@ class HostGuest(BaseModel): def _extract_host_guest( - structure: Optional[Union[Structure, IStructure]] = None, - host_guest: Optional[HostGuest] = None, - operates_on: str = "structure", + structure: Union[Structure, IStructure], + operates_on: Optional[str] = None, remove_guests: bool = True, local_env_method: str = "vesta", ): - if structure is None and host_guest is None: - raise ValueError("You must provide a structure or host_guest.") - - if structure is not None: - if not isinstance(structure, (IStructure)): - structure = IStructure.from_sites(structure.sites) - structure_graph = get_structure_graph(structure, local_env_method) - mols, _mol_graphs, mol_indices, _centers, _coordinates = get_subgraphs_as_molecules( - structure_graph - ) - if remove_guests: - host = remove_guests_from_structure(structure_graph.structure, mol_indices) - else: - host = structure_graph.structure - - if operates_on == "structure": - mols = [boxed_molecule(mol) for mol in mols] - + if operates_on is None: + operates_on = set([Structure]) + + if not isinstance(structure, (IStructure)): + structure = IStructure.from_sites(structure.sites) + structure_graph = get_structure_graph(structure, local_env_method) + mols, _mol_graphs, mol_indices, _centers, _coordinates = get_subgraphs_as_molecules( + structure_graph + ) + if remove_guests: + host = remove_guests_from_structure(structure_graph.structure, mol_indices) else: - host = host_guest.host - mols = host_guest.guests - if operates_on == "structure" and isinstance(mols[0], (Molecule, IMolecule)): - mols = [boxed_molecule(mol) for mol in mols] + host = structure_graph.structure + + # ToDo: generalize this to other types + if operates_on & set([Structure]): + mols = [boxed_molecule(mol) for mol in mols] return HostGuest(host=host, guests=mols) diff --git a/src/mofdscribe/featurizers/matmineradapter.py b/src/mofdscribe/featurizers/matmineradapter.py new file mode 100644 index 00000000..91702df9 --- /dev/null +++ b/src/mofdscribe/featurizers/matmineradapter.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- +"""Use matminer featurizers in mofdscribe.""" +from typing import List + +import numpy as np +from matminer.featurizers.base import BaseFeaturizer +from pymatgen.core.structure import Structure + +from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.utils.extend import operates_on_structure +from mofdscribe.mof import MOF + + +@operates_on_structure +class MatminerAdapter(MOFBaseFeaturizer): + """MatminerAdapter is a wrapper for matminer featurizers to be used in mofdscribe. + + That is, you can then pass :py:class:`mofdscribe.mof.MOF` objects to the + featurizer. + + .. note:: + If your matminer featurizer needs an expensive + computation, e.g., the structure graph, if cannot reuse the + one computed on the ``MOF`` object. + + Example: + >>> from mofdscribe.featurizers.matmineradapter import MatminerAdapter + >>> from matminer.featurizers.structure import DensityFeatures + >>> from mofdscribe.mof import MOF + >>> from pymatgen.core.structure import Structure + >>> hkust_structure = Structure.from_file("tests/data/hkust-1.cif") + >>> matminer_featurizer = DensityFeatures() + >>> adapter = MatminerAdapter(matminer_featurizer) + >>> features = adapter.featurize(MOF(hkust_structure)) + >>> original_features = matminer_featurizer.featurize(hkust_structure) + >>> np.allclose(features, original_features) + True + """ + + def __init__(self, matminer_featurizer: BaseFeaturizer): + """Construct a MatminerAdapter. + + Args: + matminer_featurizer (BaseFeaturizer): A matminer featurizer. + """ + self.matminer_featurizer = matminer_featurizer + + def _featurize(self, structure: Structure): + """Call the featurize method of the matminer featurizer.""" + return self.matminer_featurizer.featurize(structure) + + def featurize(self, mof: MOF) -> np.ndarray: + """Call the featurize method of the matminer featurizer using a MOF as input. + + Args: + mof (MOF): A MOF object. + + Returns: + np.ndarray: The features. + """ + return self._featurize(mof.structure) + + def _fit(self, structures: List[Structure]) -> np.ndarray: + """Call the fit method of the matminer featurizer.""" + self.matminer_featurizer.fit( + [Structure.from_sites(structure.sites) for structure in structures] + ) + + def fit(self, mofs: List[MOF]): + """Call the fit method of the matminer featurizer using a list of MOFs as input. + + Args: + mofs (List[MOF]): A list of MOF objects. + """ + self._fit([Structure.from_sites(mof.structure.sites) for mof in mofs]) + + def feature_labels(self): + """Call the feature_labels method of the matminer featurizer.""" + return self.matminer_featurizer.feature_labels() + + def citations(self): + """Call the citations method of the matminer featurizer.""" + return self.matminer_featurizer.citations() + + def implementors(self): + """Call the implementors method of the matminer featurizer.""" + return self.matminer_featurizer.implementors() diff --git a/src/mofdscribe/featurizers/pore/geometric_properties.py b/src/mofdscribe/featurizers/pore/geometric_properties.py index e354d82f..3f36804e 100644 --- a/src/mofdscribe/featurizers/pore/geometric_properties.py +++ b/src/mofdscribe/featurizers/pore/geometric_properties.py @@ -1,5 +1,11 @@ # -*- coding: utf-8 -*- -"""Computing pore properties using Zeo++.""" +"""Computing pore properties using Zeo++. + +.. warning:: + + Note that we found that the results from zeo++ might depend on the + operating system. See https://github.com/lsmo-epfl/zeopp-lsmo/issues/18. +""" import os import re import subprocess @@ -10,12 +16,13 @@ import numpy as np import pandas as pd from loguru import logger -from pymatgen.core import IStructure, Structure +from pymatgen.core import Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer - -from ..utils import is_tool -from ..utils.tempdir import TEMPDIR +from mofdscribe.featurizers.utils import is_tool +from mofdscribe.featurizers.utils.tempdir import TEMPDIR +from mofdscribe.mof import MOF +from mofdscribe.types import StructureIStructureType ZEOPP_BASE_COMMAND = ["network"] HA_COMMAND = ["-ha"] @@ -175,7 +182,6 @@ class PoreDiameters(MOFBaseFeaturizer): def __init__( self, ha: bool = True, - primitive: bool = True, ): """Initialize the featurizer. @@ -185,14 +191,14 @@ def __init__( It has been reported that this can lead to issues for some structures. Default is True. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ self.labels = ["lis", "lifs", "lifsp"] self.ha = ha - super().__init__(primitive=primitive) - def _featurize(self, s): + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StructureIStructureType): result = run_zeopp(s, ["-res"], _parse_res_zeopp, self.ha) return np.array(list(result.values())) @@ -229,7 +235,6 @@ def __init__( num_samples: int = 100, channel_radius: Union[str, float, None] = None, ha: bool = True, - primitive: bool = True, ): """Initialize the SurfaceArea featurizer. @@ -245,8 +250,6 @@ def __init__( It has been reported that this can lead to issues for some structures. Default is True. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ if channel_radius is not None and probe_radius != channel_radius: logger.warning( @@ -278,9 +281,11 @@ def __init__( ] self.labels = [f"{label}_{self.probe_radius}" for label in labels] - super().__init__(primitive=primitive) - def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StructureIStructureType) -> np.ndarray: command = [ "-sa", f"{self.channel_radius}", @@ -323,7 +328,6 @@ def __init__( num_samples: int = 100, channel_radius: Union[str, float, None] = None, ha: bool = True, - primitive: bool = True, ): """Initialize the AccessibleVolume featurizer. @@ -339,8 +343,6 @@ def __init__( It has been reported that this can lead to issues for some structures. Default is True. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ if channel_radius is not None and probe_radius != channel_radius: logger.warning( @@ -370,9 +372,11 @@ def __init__( "nav_cm3g", ] self.labels = [f"{label}_{self.probe_radius}" for label in labels] - super().__init__(primitive=primitive) - def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StructureIStructureType) -> np.ndarray: command = ["-vol", f"{self.channel_radius}", f"{self.probe_radius}", f"{self.num_samples}"] results = run_zeopp(s, command, _parse_volpo_zeopp, self.ha) return np.array(list(results.values())) @@ -439,7 +443,6 @@ def __init__( num_samples: int = 50000, channel_radius: Optional[Union[str, float]] = None, ha: bool = True, - primitive: bool = True, ) -> None: """Initialize the RayTracingHistogram featurizer. @@ -460,8 +463,6 @@ def __init__( It has been reported that this can lead to issues for some structures. Default is True. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ if channel_radius is not None and probe_radius != channel_radius: logger.warning( @@ -480,12 +481,14 @@ def __init__( self.num_samples = num_samples self.channel_radius = channel_radius self.ha = ha - super().__init__(primitive=primitive) def feature_labels(self) -> List[str]: return [f"ray_hist_{self.probe_radius}_{i}" for i in range(1000)] - def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StructureIStructureType) -> np.ndarray: command = [ "-ray_atom", f"{self.channel_radius}", @@ -562,7 +565,6 @@ def __init__( channel_radius: Optional[Union[str, float]] = None, hist_type: str = "derivative", ha: bool = False, - primitive: bool = True, ) -> None: """Initialize the PoreSizeDistribution featurizer. @@ -585,8 +587,6 @@ def __init__( It has been reported that this can lead to issues for some structures. Default is True. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. Raises: ValueError: If type not one of 'count', 'cumulative', 'derivative'. @@ -618,12 +618,14 @@ def __init__( self.num_samples = num_samples self.channel_radius = channel_radius self.ha = ha - super().__init__(primitive=primitive) def feature_labels(self) -> List[str]: return [f"psd_hist_{self.probe_radius}_{i}" for i in range(1000)] - def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, s: StructureIStructureType) -> np.ndarray: command = [ "-psd", f"{self.channel_radius}", diff --git a/src/mofdscribe/featurizers/pore/voxelgrid.py b/src/mofdscribe/featurizers/pore/voxelgrid.py index 490a8341..9007f6af 100644 --- a/src/mofdscribe/featurizers/pore/voxelgrid.py +++ b/src/mofdscribe/featurizers/pore/voxelgrid.py @@ -1,14 +1,15 @@ # -*- coding: utf-8 -*- """Featurizer that computes 3D voxelgrids.""" -from typing import List, Tuple, Union +from typing import List, Tuple import numpy as np from element_coder import encode -from pymatgen.core import Element, IStructure, Structure +from pymatgen.core import Element from mofdscribe.featurizers.base import MOFBaseFeaturizer - -from ._voxelgrid import VoxelGrid as VGBase +from mofdscribe.featurizers.pore._voxelgrid import VoxelGrid as VGBase +from mofdscribe.mof import MOF +from mofdscribe.types import StructureIStructureType def make_supercell( @@ -114,7 +115,6 @@ def __init__( properties: Tuple[str, int] = ("X", "electron_affinity"), flatten: bool = True, regular_bounding_box: bool = True, - primitive: bool = False, ): """Initialize a VoxelGrid featurizer. @@ -141,8 +141,6 @@ def __init__( regular_bounding_box (bool): If True, the bounding box of the point cloud will be adjusted in order to have all the dimensions of equal length. Defaults to True. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to True. """ self.min_size = min_size self.n_x = n_x @@ -153,9 +151,11 @@ def __init__( self.geometry_aggregations = geometry_aggregations self.flatten = flatten self.regular_bounding_box = regular_bounding_box - super().__init__(primitive=primitive) - def _featurize(self, structure: Union[IStructure, Structure]) -> np.ndarray: + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + + def _featurize(self, structure: StructureIStructureType) -> np.ndarray: coords, numbers = make_supercell( structure.cart_coords, structure.atomic_numbers, structure.lattice.matrix, self.min_size ) diff --git a/src/mofdscribe/featurizers/topology/atom_centered_ph.py b/src/mofdscribe/featurizers/topology/atom_centered_ph.py index 69f5c52a..e14b4cb9 100644 --- a/src/mofdscribe/featurizers/topology/atom_centered_ph.py +++ b/src/mofdscribe/featurizers/topology/atom_centered_ph.py @@ -5,22 +5,25 @@ import numpy as np from element_coder import encode_many -from matminer.featurizers.base import BaseFeaturizer from pymatgen.core import IStructure, Structure -from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.base import MOFBaseFeaturizer, MOFBaseSiteFeaturizer +from mofdscribe.featurizers.topology._tda_helpers import ( + construct_pds_cached, + diagrams_to_bd_arrays, + persistent_diagram_stats, +) from mofdscribe.featurizers.utils import flatten from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure - -from ._tda_helpers import construct_pds_cached, diagrams_to_bd_arrays, persistent_diagram_stats +from mofdscribe.mof import MOF # Todo: allow doing this with cutoff and coordination shells # ToDo: check if this works with molecules @operates_on_istructure @operates_on_structure -class AtomCenteredPHSite(BaseFeaturizer): +class AtomCenteredPHSite(MOFBaseSiteFeaturizer): """Site featurizer for atom-centered statistics of persistence diagrams. This featurizer is an abstraction of the on described in the work of Jiang @@ -71,7 +74,19 @@ def __init__( self.dimensions = dimensions self.alpha_weight = alpha_weight - def featurize(self, s: Union[Structure, IStructure], idx: int) -> np.ndarray: + def featurize(self, mof: MOF, idx: int) -> np.ndarray: + """Compute the features for a single site. + + Args: + mof (MOF): MOF to featurize. + idx (int): Index of site to featurize. + + Returns: + np.ndarray: Features for the site. + """ + return self._featurize(mof.structure, idx) + + def _featurize(self, s: Union[Structure, IStructure], idx: int) -> np.ndarray: neighbors = s.get_neighbors(s[idx], self.cutoff) neighbor_structure = IStructure.from_sites(neighbors) if self.alpha_weight is not None: @@ -160,7 +175,6 @@ def __init__( species_aggregation_functions: Tuple[str] = ("min", "max", "mean", "std"), cutoff: float = 12, dimensions: Tuple[int] = (1, 2), - primitive: bool = False, alpha_weight: Optional[str] = None, ) -> None: """ @@ -187,8 +201,6 @@ def __init__( Defaults to 12. dimensions (Tuple[int]): Betti numbers of consider. 0 describes isolated components, 1 cycles and 2 cavities. Defaults to (1, 2). - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to False. alpha_weight (Optional[str]): If specified, the use weighted alpha shapes, i.e., replacing the points with balls of varying radii. For instance `atomic_radius_calculated` or `van_der_waals_radius`. @@ -210,18 +222,19 @@ def __init__( ) self.compute_for_all_elements = compute_for_all_elements - super().__init__(primitive=primitive) - def _get_relevant_atom_type(self, element: str) -> str: for atom_type in self.atom_types: if element in atom_type: return atom_type + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: results = defaultdict(list) for idx, site in enumerate(s): atom_type = self._get_relevant_atom_type(site.specie.symbol) - features = self.site_featurizer.featurize(s, idx) + features = self.site_featurizer._featurize(s, idx) if atom_type is not None: results[atom_type].append(features) results["all"].append(features) diff --git a/src/mofdscribe/featurizers/topology/ph_hist.py b/src/mofdscribe/featurizers/topology/ph_hist.py index 74d1cfcc..0fbdfa42 100644 --- a/src/mofdscribe/featurizers/topology/ph_hist.py +++ b/src/mofdscribe/featurizers/topology/ph_hist.py @@ -7,14 +7,14 @@ from pymatgen.core import IMolecule, IStructure, Molecule, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.topology._tda_helpers import get_diagrams_for_structure from mofdscribe.featurizers.utils.extend import ( operates_on_imolecule, operates_on_istructure, operates_on_molecule, operates_on_structure, ) - -from ._tda_helpers import get_diagrams_for_structure +from mofdscribe.mof import MOF @operates_on_imolecule @@ -51,7 +51,6 @@ def __init__( y_axis_label: str = "persistence", normed: bool = True, no_supercell: bool = False, - primitive: bool = False, alpha_weight: Optional[str] = None, ) -> None: """Initialize the PHStats object. @@ -81,8 +80,6 @@ def __init__( Defaults to True. no_supercell (bool): If true, then the supercell is not created. Defaults to False. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to False. alpha_weight (Optional[str]): If specified, the use weighted alpha shapes, i.e., replacing the points with balls of varying radii. For instance `atomic_radius_calculated` or `van_der_waals_radius`. @@ -102,7 +99,6 @@ def __init__( self.normed = normed self.no_supercell = no_supercell self.alpha_weight = alpha_weight - super().__init__(primitive=primitive) def _get_feature_labels(self) -> List[str]: labels = [] @@ -119,6 +115,9 @@ def _get_feature_labels(self) -> List[str]: def feature_labels(self) -> List[str]: return self._get_feature_labels() + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + def _featurize( self, structure: Union[Structure, IStructure, Molecule, IMolecule] ) -> np.ndarray: diff --git a/src/mofdscribe/featurizers/topology/ph_image.py b/src/mofdscribe/featurizers/topology/ph_image.py index 9a406d9c..54f59545 100644 --- a/src/mofdscribe/featurizers/topology/ph_image.py +++ b/src/mofdscribe/featurizers/topology/ph_image.py @@ -7,17 +7,17 @@ from pymatgen.core import IMolecule, IStructure, Molecule, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.topology._tda_helpers import ( + get_persistence_image_limits_for_structure, + get_persistent_images_for_structure, +) from mofdscribe.featurizers.utils.extend import ( operates_on_imolecule, operates_on_istructure, operates_on_molecule, operates_on_structure, ) - -from ._tda_helpers import ( - get_persistence_image_limits_for_structure, - get_persistent_images_for_structure, -) +from mofdscribe.mof import MOF @operates_on_imolecule @@ -72,7 +72,6 @@ def __init__( max_fit_tolerence: float = 0.1, periodic: bool = False, no_supercell: bool = False, - primitive: bool = False, alpha_weight: Optional[str] = None, ) -> None: """Construct a PHImage object. @@ -112,8 +111,6 @@ def __init__( in the analysis (experimental!). Defaults to False. no_supercell (bool): If true, then the supercell is not created. Defaults to False. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to False. alpha_weight (Optional[str]): If specified, the use weighted alpha shapes, i.e., replacing the points with balls of varying radii. For instance `atomic_radius_calculated` or `van_der_waals_radius`. @@ -161,8 +158,6 @@ def __init__( self.no_supercell = no_supercell self.alpha_weight = alpha_weight - super().__init__(primitive=primitive) - def _get_feature_labels(self) -> List[str]: labels = [] _elements = list(self.atom_types) @@ -179,6 +174,9 @@ def _get_feature_labels(self) -> List[str]: def feature_labels(self) -> List[str]: return self._get_feature_labels() + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + def _featurize( self, structure: Union[Structure, IStructure, Molecule, IMolecule] ) -> np.ndarray: @@ -206,6 +204,18 @@ def _featurize( features.append(np.array(results["image"][element][dim]).flatten()) return np.concatenate(features) + def fit(self, mofs: List[MOF]) -> None: + """Use structures to estimate the settings for the featurizer. + + Find the limits (maximum/minimum birth/death and persistence) + for all the structures in the dataset and store them in the object. + + Args: + mofs (List[MOF]): List of MOFs to find the limits for. + """ + structures = [mof.structure for mof in mofs] + self._fit(structures) + def _fit(self, structures: List[Union[Structure, IStructure, Molecule, IMolecule]]) -> None: """Use structures to estimate the settings for the featurizer. diff --git a/src/mofdscribe/featurizers/topology/ph_stats.py b/src/mofdscribe/featurizers/topology/ph_stats.py index 60dc70fb..650d358a 100644 --- a/src/mofdscribe/featurizers/topology/ph_stats.py +++ b/src/mofdscribe/featurizers/topology/ph_stats.py @@ -6,6 +6,10 @@ from pymatgen.core import IMolecule, IStructure, Molecule, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.topology._tda_helpers import ( + get_diagrams_for_structure, + persistent_diagram_stats, +) from mofdscribe.featurizers.utils import flatten from mofdscribe.featurizers.utils.extend import ( operates_on_imolecule, @@ -13,8 +17,7 @@ operates_on_molecule, operates_on_structure, ) - -from ._tda_helpers import get_diagrams_for_structure, persistent_diagram_stats +from mofdscribe.mof import MOF @operates_on_imolecule @@ -48,7 +51,6 @@ def __init__( aggregation_functions: Tuple[str] = ("min", "max", "mean", "std"), periodic: bool = False, no_supercell: bool = False, - primitive: bool = False, alpha_weight: Optional[str] = None, ) -> None: """Initialize the PHStats object. @@ -73,8 +75,6 @@ def __init__( Defaults to False. no_supercell (bool): If true, then the supercell is not created. Defaults to False. - primitive (bool): If True, the structure is reduced to its primitive - form before the descriptor is computed. Defaults to False. alpha_weight (Optional[str]): If specified, the use weighted alpha shapes, i.e., replacing the points with balls of varying radii. For instance `atomic_radius_calculated` or `van_der_waals_radius`. @@ -91,7 +91,6 @@ def __init__( self.periodic = periodic self.no_supercell = no_supercell self.alpha_weight = alpha_weight - super().__init__(primitive=primitive) def _get_feature_labels(self) -> List[str]: labels = [] @@ -106,6 +105,9 @@ def _get_feature_labels(self) -> List[str]: def feature_labels(self) -> List[str]: return self._get_feature_labels() + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + def _featurize( self, structure: Union[Structure, IStructure, Molecule, IMolecule] ) -> np.ndarray: diff --git a/src/mofdscribe/featurizers/topology/ph_vect.py b/src/mofdscribe/featurizers/topology/ph_vect.py index 7566f793..02135c2c 100644 --- a/src/mofdscribe/featurizers/topology/ph_vect.py +++ b/src/mofdscribe/featurizers/topology/ph_vect.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """Featurizers using persistent homology -- vectorized using Gaussian mixture models.""" from collections import defaultdict -from typing import List, Optional, Tuple, Union +from typing import Collection, List, Optional, Tuple, Union import numpy as np from loguru import logger @@ -9,14 +9,14 @@ from pymatgen.core import IMolecule, IStructure, Molecule, Structure from mofdscribe.featurizers.base import MOFBaseFeaturizer +from mofdscribe.featurizers.topology._tda_helpers import get_diagrams_for_structure from mofdscribe.featurizers.utils.extend import ( operates_on_imolecule, operates_on_istructure, operates_on_molecule, operates_on_structure, ) - -from ._tda_helpers import get_diagrams_for_structure +from mofdscribe.mof import MOF def _apply_and_fill(transformer_func, diagrams): @@ -237,7 +237,6 @@ def __init__( self.periodic = periodic self.no_supercell = no_supercell self.alpha_weight = alpha_weight - super().__init__(primitive=primitive) def _get_feature_labels(self) -> List[str]: labels = [] @@ -251,6 +250,9 @@ def _get_feature_labels(self) -> List[str]: def feature_labels(self) -> List[str]: return self._get_feature_labels() + def featurize(self, mof: MOF) -> np.ndarray: + return self._featurize(mof.structure) + def _featurize( self, structure: Union[Structure, IStructure, Molecule, IMolecule] ) -> np.ndarray: @@ -269,9 +271,15 @@ def _featurize( compiled_results = self._reshape_results(res, 1).flatten() return compiled_results - def fit(self, structures: Union[Structure, IStructure, Molecule, IMolecule]) -> "PHVect": - if self.primitive: - structures = self._get_primitive_many(structures) + def fit(self, mofs: Collection["MOF"]): + """Fit the featurizer to a collection of MOFs. + + Args: + mofs (Collection[MOF]): A collection of MOFs to fit the featurizer to. + """ + self._fit([mof.structure for mof in mofs]) + + def _fit(self, structures: Union[Structure, IStructure, Molecule, IMolecule]) -> "PHVect": self.transformers, _ = _fit_transform_structures( self.transformers, structures, @@ -294,11 +302,20 @@ def _reshape_results(self, results, num_structures) -> np.ndarray: n_col += self.n_components return compiled_results - def fit_transform( + def fit_transform(self, mofs: Collection["MOF"]) -> np.ndarray: + """Fit the featurizer to a collection of MOFs and return the featurized values. + + Args: + mofs (Collection[MOF]): A collection of MOFs to fit the featurizer to. + + Returns: + np.ndarray: The featurized values. + """ + return self._fit_transform([mof.structure for mof in mofs]) + + def _fit_transform( self, structures: Union[Structure, IStructure, Molecule, IMolecule] ) -> np.ndarray: - if self.primitive: - structures = self._get_primitive_many(structures) self.transformers, results = _fit_transform_structures( self.transformers, structures, diff --git a/src/mofdscribe/featurizers/utils/__init__.py b/src/mofdscribe/featurizers/utils/__init__.py index 57db31df..fa7b1c78 100644 --- a/src/mofdscribe/featurizers/utils/__init__.py +++ b/src/mofdscribe/featurizers/utils/__init__.py @@ -5,7 +5,7 @@ from shutil import which import numpy as np -from pymatgen.core import Molecule, Structure +from pymatgen.core import Structure if sys.version_info.major == 3 and sys.version_info.minor >= 10: from collections.abc import MutableMapping @@ -45,12 +45,6 @@ def flatten(d: dict, parent_key: str = "", sep: str = ".") -> dict: def set_operates_on(self, featurizer): try: _operates_on = featurizer.operates_on() - if (Structure in _operates_on) and (Molecule in _operates_on): - self._operates_on = "both" - elif Structure in _operates_on: - self._operates_on = "structure" - elif Molecule in _operates_on: - self._operates_on = "molecule" - + self._operates_on = set(_operates_on) except AttributeError: - self._operates_on = "structure" + self._operates_on = {Structure} diff --git a/src/mofdscribe/featurizers/utils/aggregators.py b/src/mofdscribe/featurizers/utils/aggregators.py index 01628776..ae3fa976 100644 --- a/src/mofdscribe/featurizers/utils/aggregators.py +++ b/src/mofdscribe/featurizers/utils/aggregators.py @@ -48,6 +48,13 @@ def masked_trimean(values): return (q1 + 2 * q2 + q3) / 4 +def try_except_nan(func, *args, **kwargs): + try: + return func(*args, **kwargs) + except Exception: + return np.nan + + AGGREGATORS = { "sum": lambda x: x[0] + x[1], "avg": lambda x: (x[0] + x[1]) / 2, @@ -62,36 +69,38 @@ def masked_trimean(values): ARRAY_AGGREGATORS = { - "sum": lambda x, **kwargs: np.sum(x, **kwargs), - "avg": lambda x, **kwargs: np.mean(x, **kwargs), - "max": lambda x, **kwargs: np.max(x, **kwargs), - "min": lambda x, **kwargs: np.min(x, **kwargs), - "std": lambda x, **kwargs: np.std(x, **kwargs), - "range": lambda x, **kwargs: np.max(x, **kwargs) - np.min(x, **kwargs), - "mean": lambda x, **kwargs: np.mean(x, **kwargs), - "median": lambda x, **kwargs: np.median(x, **kwargs), - "geomean": lambda x, **kwargs: gmean(x, **kwargs), - "harmean": lambda x, **kwargs: hmean(x, **kwargs), - "mad": lambda x, **kwargs: mad(x, **kwargs), - "trimean": lambda x, **kwargs: trimean(x, **kwargs), - "inf": lambda x, **kwargs: np.linalg.norm(x, ord=np.inf, **kwargs), - "manhattan": lambda x, **kwargs: np.linalg.norm(x, ord=1, **kwargs), + "sum": lambda x, **kwargs: try_except_nan(np.sum, x, **kwargs), + "avg": lambda x, **kwargs: try_except_nan(np.mean, x, **kwargs), + "max": lambda x, **kwargs: try_except_nan(np.max, x, **kwargs), + "min": lambda x, **kwargs: try_except_nan(np.min, x, **kwargs), + "std": lambda x, **kwargs: try_except_nan(np.std, x, **kwargs), + "range": lambda x, **kwargs: try_except_nan(np.max, x, **kwargs) + - try_except_nan(np.min, x, **kwargs), + "mean": lambda x, **kwargs: try_except_nan(np.mean, x, **kwargs), + "median": lambda x, **kwargs: try_except_nan(np.median, x, **kwargs), + "geomean": lambda x, **kwargs: try_except_nan(gmean, x, **kwargs), + "harmean": lambda x, **kwargs: try_except_nan(hmean, x, **kwargs), + "mad": lambda x, **kwargs: try_except_nan(mad, x, **kwargs), + "trimean": lambda x, **kwargs: try_except_nan(trimean, x, **kwargs), + "inf": lambda x, **kwargs: try_except_nan(np.linalg.norm, x, ord=np.inf, **kwargs), + "manhattan": lambda x, **kwargs: try_except_nan(np.linalg.norm, x, ord=1, **kwargs), } MA_ARRAY_AGGREGATORS = { - "sum": lambda x, **kwargs: np.ma.sum(x, **kwargs), - "avg": lambda x, **kwargs: np.ma.mean(x, **kwargs), - "max": lambda x, **kwargs: np.ma.max(x, **kwargs), - "min": lambda x, **kwargs: np.ma.min(x, **kwargs), - "std": lambda x, **kwargs: np.ma.std(x, **kwargs), - "range": lambda x, **kwargs: np.ma.max(x, **kwargs) - np.ma.min(x, **kwargs), - "mean": lambda x, **kwargs: np.ma.mean(x, **kwargs), - "median": lambda x, **kwargs: np.ma.median(x, **kwargs), - "geomean": lambda x, **kwargs: mstast_gmean(x, **kwargs), - "harmean": lambda x, **kwargs: mstast_hmean(x, **kwargs), - "mad": lambda x, **kwargs: masked_mad(x, **kwargs), - "trimean": lambda x, **kwargs: masked_trimean(x, **kwargs), - "inf": lambda x, **kwargs: np.linalg.norm(x, ord=np.inf, **kwargs), - "manhattan": lambda x, **kwargs: np.linalg.norm(x, ord=1, **kwargs), + "sum": lambda x, **kwargs: try_except_nan(np.ma.sum, x, **kwargs), + "avg": lambda x, **kwargs: try_except_nan(np.ma.mean, x, **kwargs), + "max": lambda x, **kwargs: try_except_nan(np.ma.max, x, **kwargs), + "min": lambda x, **kwargs: try_except_nan(np.ma.min, x, **kwargs), + "std": lambda x, **kwargs: try_except_nan(np.ma.std, x, **kwargs), + "range": lambda x, **kwargs: try_except_nan(np.ma.max, x, **kwargs) + - try_except_nan(np.ma.min, x, **kwargs), + "mean": lambda x, **kwargs: try_except_nan(np.ma.mean, x, **kwargs), + "median": lambda x, **kwargs: try_except_nan(np.ma.median, x, **kwargs), + "geomean": lambda x, **kwargs: try_except_nan(mstast_gmean, x, **kwargs), + "harmean": lambda x, **kwargs: try_except_nan(mstast_hmean, x, **kwargs), + "mad": lambda x, **kwargs: try_except_nan(masked_mad, x, **kwargs), + "trimean": lambda x, **kwargs: try_except_nan(masked_trimean, x, **kwargs), + "inf": lambda x, **kwargs: try_except_nan(np.linalg.norm, x, ord=np.inf, **kwargs), + "manhattan": lambda x, **kwargs: try_except_nan(np.linalg.norm, x, ord=1, **kwargs), } diff --git a/src/mofdscribe/featurizers/utils/definitions.py b/src/mofdscribe/featurizers/utils/definitions.py new file mode 100644 index 00000000..d1bae93e --- /dev/null +++ b/src/mofdscribe/featurizers/utils/definitions.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- +"""Constants used in the featurizers.""" +from element_coder.data.coding_data import get_coding_dict + +ALL_ELEMENTS = set(get_coding_dict("atomic").keys()) +ALL_ELEMENTS_EXCEPT_C_H = ALL_ELEMENTS - {"C", "H"} +ALL_METAL_ELEMENTS = set( + ( + "Li", + "Be", + "Na", + "Mg", + "Al", + "K", + "Ca", + "Sc", + "Ti", + "V", + "Cr", + "Mn", + "Fe", + "Co", + "Ni", + "Cu", + "Zn", + "Ga", + "Rb", + "Sr", + "Y", + "Zr", + "Nb", + "Mo", + "Tc", + "Ru", + "Rh", + "Pd", + "Ag", + "Cd", + "In", + "Sn", + "Cs", + "Ba", + "La", + "Ce", + "Pr", + "Nd", + "Pm", + "Sm", + "Eu", + "Gd", + "Tb", + "Dy", + "Ho", + "Er", + "Tm", + "Yb", + "Lu", + "Hf", + "Ta", + "W", + "Re", + "Os", + "Ir", + "Pt", + "Au", + "Hg", + "Tl", + "Pb", + "Bi", + "Fr", + "Ra", + "Ac", + "Th", + "Pa", + "U", + "Np", + "Pu", + "Am", + "Cm", + "Bk", + "Cf", + "Es", + "Fm", + "Md", + "No", + "Lr", + "Rf", + "Db", + "Sg", + "Bh", + "Hs", + "Mt", + "Ds", + "Rg", + "Cn", + "Al", + "Ga", + "In", + "Tl", + "Ge", + "Sn", + "Pb", + "Sb", + "Bi", + "Po", + ) +) + +ALL_NONMETAL_ELEMENTS = ALL_ELEMENTS - ALL_METAL_ELEMENTS diff --git a/src/mofdscribe/featurizers/utils/extend.py b/src/mofdscribe/featurizers/utils/extend.py index 2f22f4a6..5d307996 100644 --- a/src/mofdscribe/featurizers/utils/extend.py +++ b/src/mofdscribe/featurizers/utils/extend.py @@ -3,6 +3,7 @@ from functools import partial from typing import Type +from pymatgen.analysis.graphs import MoleculeGraph, StructureGraph from pymatgen.core import IMolecule, IStructure, Molecule, Structure @@ -35,3 +36,5 @@ def operates_on(self): operates_on_istructure = partial(add_operates_on, type=IStructure) operates_on_molecule = partial(add_operates_on, type=Molecule) operates_on_imolecule = partial(add_operates_on, type=IMolecule) +operates_on_structuregraph = partial(add_operates_on, type=StructureGraph) +operates_on_moleculegraph = partial(add_operates_on, type=MoleculeGraph) diff --git a/src/mofdscribe/featurizers/utils/mixins.py b/src/mofdscribe/featurizers/utils/mixins.py new file mode 100644 index 00000000..d995456a --- /dev/null +++ b/src/mofdscribe/featurizers/utils/mixins.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +"""Mixin classes for featurizers.""" +import numpy as np + + +class GetGridMixin: + """Mixin class for getting a linearly spaced grid.""" + + def _get_grid(self, lower_bound, upper_bound, bin_size): + return np.arange(lower_bound, upper_bound, bin_size) diff --git a/src/mofdscribe/featurizers/utils/structure_graph.py b/src/mofdscribe/featurizers/utils/structure_graph.py index 8d8a150c..d7889eae 100644 --- a/src/mofdscribe/featurizers/utils/structure_graph.py +++ b/src/mofdscribe/featurizers/utils/structure_graph.py @@ -1,45 +1,62 @@ # -*- coding: utf-8 -*- """perform analyses on structure graphs.""" +from collections import defaultdict from functools import lru_cache -from typing import List, Optional, Set, Tuple +from typing import Dict, List, Optional +import networkx as nx from pymatgen.analysis.graphs import StructureGraph from pymatgen.core import IStructure from structuregraph_helpers.create import get_structure_graph as get_sg -def get_neighbors_at_distance( - structure_graph: StructureGraph, start: int, scope: int -) -> Tuple[Set[int], List[int]]: - """For a structure graph and a start site, return all sites within a certain\ - distance (scope) of the start site. +def leads_to_terminal(nx_graph, edge, bridges: Dict[int, int] = None): + if bridges is None: + bridges = _generate_bridges(nx_graph) + sorted_edge = sorted(edge) + try: + bridge_edge = bridges[sorted_edge[0]] + return sorted_edge[1] in bridge_edge + except KeyError: + return False + + +def _generate_bridges(nx_graph) -> Dict[int, int]: + + bridges = list(nx.bridges(nx_graph)) + + bridges_dict = defaultdict(list) + for key, value in bridges: + bridges_dict[key].append(value) + + return bridges_dict + + +def get_neighbors_up_to_scope(structure_graph, site_index, scope): + """Get only the neighbors at a certain scope. + + That is, scope=3 will return all neighbors three bonds away from the + site_index. Args: - structure_graph (StructureGraph): pymatgen StructureGraph object - start (int): starting atom - scope (int): distance to search + structure_graph (StructureGraph): The structure graph. + site_index (int): The site index. + scope (int): The scope. Returns: - Tuple[Set[int], List[int]]: All sites within the scope of the start - site, and the indices of the sites in the last shell + list: The list of neighbors. """ - # Todo: This code is stupid. - neighbors_at_last_level = [start] - all_neighbors = set() - neighbors_at_next_level = [] - for _ in range(scope): - for n in neighbors_at_last_level: - neighbors_at_next_level.extend(get_connected_site_indices(structure_graph, n)) - - all_neighbors.update(neighbors_at_last_level) - neighbors_at_last_level = neighbors_at_next_level - neighbors_at_next_level = [] - all_neighbors.remove(start) - neighbors_at_last_level = set(neighbors_at_last_level) - if start in neighbors_at_last_level: - neighbors_at_last_level.remove(start) - - return all_neighbors, neighbors_at_last_level + neighbors_in_scope = defaultdict(set) + neighbors_in_scope[0] = [site_index] + visited = set([site_index]) + for i in range(1, scope + 1): + for site in neighbors_in_scope[i - 1]: + neighbors = get_connected_site_indices(structure_graph, site) + neighbors = [n for n in neighbors if n not in visited] + neighbors_in_scope[i].update(neighbors) + visited.update(neighbors) + + return neighbors_in_scope @lru_cache() diff --git a/src/mofdscribe/mof.py b/src/mofdscribe/mof.py new file mode 100644 index 00000000..4cb43e75 --- /dev/null +++ b/src/mofdscribe/mof.py @@ -0,0 +1,197 @@ +# -*- coding: utf-8 -*- +"""MOF class - the container consumed by all featurizers.""" +from collections import namedtuple +from typing import Optional + +from backports.cached_property import cached_property +from pymatgen.analysis.graphs import StructureGraph +from pymatgen.core.structure import IStructure, Structure +from structuregraph_helpers.create import get_structure_graph +from structuregraph_helpers.hash import ( + decorated_graph_hash, + decorated_scaffold_hash, + undecorated_graph_hash, + undecorated_scaffold_hash, +) + +from mofdscribe.types import PathType, StructureIStructureType + + +class MOF: + """A container for a MOF structure. + + The MOF class is the container for a MOF structure. It contains at least + the structure itself, but can also contain additional information such as + + - the structure graph + - the fragments + - the hashes + + The MOF class is a conenient input for the featurizers because + they can simply take whatever they need from the MOF class. + Additionally, expensive computations such as the structure graph + or the fragments can be cached in the MOF class. + + This not only simplifies the featurizer code, but also makes it + possible to reuse the artifacts of the MOF class for other purposes. + + .. note:: + + By default, the MOF class will use :py:obj:`~pymatgen.core.structure.IStructure` + as the structure container. This is because we want to make sure that there is + no way tht the inputs are mutated during the featurization process. + + However, some aspects of ``matminer`` require the use of :py:obj:`~pymatgen.core.structure.Structure` + as the structure container. The corresponding matminer adapter will deal with this. + However, we cannot strictly enfore immutability of the structure container in this case. + """ + + def __init__( + self, + structure: StructureIStructureType, + structure_graph: Optional[StructureGraph] = None, + local_env_strategy: str = "vesta", + fragmentation_kwargs: Optional[dict] = None, + ): + """Initialize a MOF class. + + Args: + structure (StructureIStructureType): The structure of the MOF as + :py:class:`~pymatgen.core.Structure` or :py:class:`~pymatgen.core.IStructure`. + structure_graph (Optional[StructureGraph]): The structure graph of the MOF. + If not provided, it will be computed on the fly. + local_env_strategy (str): The local environment strategy to use for the structure graph. + Defaults to ``"vesta"``. + fragmentation_kwargs: The fragmentation kwargs to use for the fragmentation using ``moffragmentor``. + Defaults to ``None``. + Some relevant kwargs are: + * check_dimensionality (bool): Check if the node is 0D. + If not, split into isolated metals. + Defaults to True. + * create_single_metal_bus (bool): Create a single metal BUs. + Defaults to False. + * break_organic_nodes_at_metal (bool): Break nodes into single metal BU + if they appear "too organic". + + For the dimensionality featurizers applied to building blocks, you want + to be careful wtih ``check_dimensionality``. + """ + self.__structure = ( + IStructure.from_sites(structure.sites) + if isinstance(structure, Structure) + else structure + ) + self.__structure_graph = structure_graph + self.__local_env_strategy = local_env_strategy + self.__fragmentation_kwargs = ( + fragmentation_kwargs if fragmentation_kwargs is not None else {} + ) + + @property + def structure(self) -> IStructure: + """Get the structure of the MOF.""" + return self.__structure + + @property + def structure_graph(self) -> StructureGraph: + """Get the structure graph of the MOF.""" + if self.__structure_graph is None: + self.__structure_graph = get_structure_graph( + self.__structure, self.__local_env_strategy + ) + return self.__structure_graph + + @classmethod + def from_file( + cls, path: PathType, primitive: bool = True, fragmentation_kwargs: Optional[dict] = None + ) -> "MOF": + """Create a MOF class from a file. + + Args: + path (PathType): The path to the file. + primitive (bool): Whether to use the primitive cell or not. + Defaults to ``True``. + fragmentation_kwargs: The fragmentation kwargs to use for the fragmentation using ``moffragmentor``. + + Returns: + MOF: The MOF class. + """ + return cls( + IStructure.from_file(path).get_primitive_structure() + if primitive + else IStructure.from_file(path), + fragmentation_kwargs=fragmentation_kwargs, + ) + + @cached_property + def fragments(self) -> namedtuple: + """ + Get the fragments of the MOF. + + Raises: + ImportError: If ``moffragmentor`` is not installed. + + Returns: + namedtuple: :py:class:`~moffragmentor.fragment.Fragments` namedtuple. + """ + try: + from moffragmentor import MOF as MOFFRAGMENTORMOF + except ImportError: + raise ImportError( + "moffragmentor is not installed. Please install it to use the fragments feature.\ + See https://github.com/kjappelbaum/moffragmentor for more information." + ) + + mof = MOFFRAGMENTORMOF(self.__structure, self.structure_graph) + return mof.fragment(**self.__fragmentation_kwargs) + + @cached_property + def decorated_graph_hash(self) -> str: + """Return the Weisfeiler-Lehman graph hash. + + Hashes are identical for isomorphic graphs + (taking the atomic kinds into account) + and there are guarantees that non-isomorphic graphs will get different hashes. + + Returns: + str: Graph hash + """ + return decorated_graph_hash(self.structure_graph, lqg=False) + + @cached_property + def decorated_scaffold_hash(self) -> str: + """Return the Weisfeiler-Lehman scaffold hash. + + The scaffold is the graph with the all terminal groups and + atoms removed (i.e., formally, bridges are broken). + + Returns: + str: Scaffold hash + """ + return decorated_scaffold_hash(self.structure_graph, lqg=False) + + @cached_property + def undecorated_graph_hash(self) -> str: + """Return the Weisfeiler-Lehman graph hash. + + Hashes are identical for isomorphic graphs + and there are guarantees that non-isomorphic graphs will get different hashes. + Undecorated means that the atomic numbers are not taken into account. + + Returns: + str: Graph hash + """ + return undecorated_graph_hash(self.structure_graph, lqg=False) + + @cached_property + def undecorated_scaffold_hash(self) -> str: + """Return the Weisfeiler-Lehman scaffold hash. + + The scaffold is the graph with the all terminal groups and + atoms removed (i.e., formally, bridges are broken). + Undecorated means that the atomic numbers are not taken into account. + + Returns: + str: Scaffold hash + """ + return undecorated_scaffold_hash(self.structure_graph, lqg=False) diff --git a/src/mofdscribe/types.py b/src/mofdscribe/types.py index 57ffcc83..ee6093c4 100644 --- a/src/mofdscribe/types.py +++ b/src/mofdscribe/types.py @@ -2,6 +2,8 @@ from pathlib import Path from typing import Union +from pymatgen.core.structure import IStructure, Structure from typing_extensions import TypeAlias PathType: TypeAlias = Union[str, Path] +StructureIStructureType: TypeAlias = Union[IStructure, Structure] diff --git a/tests/conftest.py b/tests/conftest.py index 72b08be2..c6a5e65e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,6 +18,24 @@ def hkust_structure(): return IStructure.from_file(os.path.join(THIS_DIR, "test_files", "HKUST-1.cif")) +@pytest.fixture(scope="session") +def cof_1_structure(): + """Return a pymatgen Structure for COF""" + return IStructure.from_file(os.path.join(THIS_DIR, "test_files", "COF-1.cif")) + + +@pytest.fixture(scope="session") +def mof74_structure(): + """Return a pymatgen Structure for MOF74""" + return IStructure.from_file(os.path.join(THIS_DIR, "test_files", "Fe-MOF-74.cif")) + + +@pytest.fixture(scope="session") +def hkust_path(): + """Return a path to a HKUST structure""" + return os.path.join(THIS_DIR, "test_files", "HKUST-1.cif") + + @pytest.fixture(scope="session") def hkust_la_structure(): """Return a pymatgen Structure for HKUST""" diff --git a/tests/featurizers/bu/test_angle_hist_featurizer.py b/tests/featurizers/bu/test_angle_hist_featurizer.py new file mode 100644 index 00000000..e58ab7ab --- /dev/null +++ b/tests/featurizers/bu/test_angle_hist_featurizer.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +from mofdscribe.featurizers.bu.angle_hist_featurizer import PairWiseAngleHist + + +def test_angle_hist_featurizer(molecule): + featurizer = PairWiseAngleHist(density=True) + feats = featurizer.featurize(molecule) + labels = featurizer.feature_labels() + assert len(feats) == len(labels) + assert feats[0] == 0 diff --git a/tests/featurizers/bu/test_angle_stats_featurizer.py b/tests/featurizers/bu/test_angle_stats_featurizer.py new file mode 100644 index 00000000..d4d2f9d1 --- /dev/null +++ b/tests/featurizers/bu/test_angle_stats_featurizer.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +from mofdscribe.featurizers.bu.angle_stats_featurizer import PairWiseAngleStats + + +def test_angle_stats_featurizer(molecule): + featurizer = PairWiseAngleStats() + feats = featurizer.featurize(molecule) + labels = featurizer.feature_labels() + assert len(feats) == len(labels) + assert (feats[0] > 90) & (feats[0] < 120) + assert feats[-1] == 180 diff --git a/tests/featurizers/bu/test_bu_featurizer.py b/tests/featurizers/bu/test_bu_featurizer.py index 90c6b3ab..a49f8c87 100644 --- a/tests/featurizers/bu/test_bu_featurizer.py +++ b/tests/featurizers/bu/test_bu_featurizer.py @@ -2,15 +2,27 @@ """Test the BU featurizer.""" import numpy as np +import pandas as pd from matminer.featurizers.site import SOAP from matminer.featurizers.structure import SiteStatsFingerprint from pymatgen.core import Structure -from mofdscribe.featurizers.bu.bu_featurizer import BUFeaturizer, MOFBBs +from mofdscribe.featurizers.bu.bu_featurizer import ( + BindingNumHopFeaturizer, + BindingSitesFeaturizer, + BranchingNumHopFeaturizer, + BranchingSitesFeaturizer, + BUFeaturizer, + MOFBBs, +) +from mofdscribe.featurizers.bu.lsop_featurizer import LSOP +from mofdscribe.featurizers.graph.dimensionality import Dimensionality +from mofdscribe.featurizers.matmineradapter import MatminerAdapter from mofdscribe.featurizers.topology import PHStats +from mofdscribe.mof import MOF -def test_bu_featurizer(hkust_structure, molecule): +def test_bu_featurizer(hkust_structure, molecule, mof74_structure): """Test that we can call BU featurizers with pymatgen molecules.""" featurizer = BUFeaturizer(PHStats(no_supercell=True)) @@ -21,37 +33,151 @@ def test_bu_featurizer(hkust_structure, molecule): assert features[600] < 2 featurizer = BUFeaturizer(PHStats(no_supercell=True)) - features = featurizer.featurize(structure=hkust_structure) + features = featurizer.featurize(mof=MOF(hkust_structure)) assert features.shape == (768,) assert features[0] > 0 assert features[0] < 2 + # test if it works with graph featurizers + featurizer = BUFeaturizer(Dimensionality(), aggregations=("mean",)) + features = featurizer.featurize(mof=MOF(hkust_structure)) + assert features.shape == (2,) + assert features[0] == 0 + assert features[1] == 0 + + # make sure we find the rod node + featurizer = BUFeaturizer(Dimensionality(), aggregations=("mean",)) + mof4 = MOF(mof74_structure, fragmentation_kwargs={"check_dimensionality": False}) + features = featurizer.featurize(mof=mof4) + assert features.shape == (2,) + assert features[0] == 1 + assert features[1] == 0 + def test_bu_featurizer_with_matminer_featurizer(hkust_structure, hkust_linker_structure): """Test that we can call BU featurizers with matminer molecules.""" # we disable the periodic keyword to be able to compare with the molecules - base_feat = SiteStatsFingerprint(SOAP(6, 8, 8, 0.4, False, "gto", False)) + base_feat = MatminerAdapter(SiteStatsFingerprint(SOAP(4, 4, 4, 0.1, False, "gto", False))) hkust_structure = Structure.from_sites(hkust_structure.sites) - base_feat.fit([hkust_structure]) featurizer = BUFeaturizer(base_feat, aggregations=("mean",)) - features = featurizer.featurize(structure=hkust_structure) - assert features.shape == (2592 * 2,) + featurizer.fit([MOF(hkust_structure)]) + features = featurizer.featurize(mof=MOF(hkust_structure)) + assert features.shape == (400 * 2,) assert features[0] >= 0 assert features[0] < 2 linker_feats = [f for f in featurizer.feature_labels() if "linker" in f] - assert len(linker_feats) == 2592 + assert len(linker_feats) == 400 # test that our fit method works featurizer = BUFeaturizer( - SiteStatsFingerprint(SOAP(6, 8, 8, 0.4, False, "gto", False)), aggregations=("mean",) + MatminerAdapter(SiteStatsFingerprint(SOAP(4, 4, 4, 0.1, False, "gto", False))), + aggregations=("mean",), ) - featurizer.fit([hkust_structure]) - features_direct_fit = featurizer.featurize(structure=hkust_structure) + featurizer.fit([MOF(hkust_structure)]) + features_direct_fit = featurizer.featurize(mof=MOF(hkust_structure)) assert np.allclose(features, features_direct_fit, rtol=0.01) # test that the linker features are actually the ones we get when # we featurize the linker - linker_feats = featurizer._featurizer.featurize(hkust_linker_structure) + linker_feats = featurizer._featurizer._featurize(hkust_linker_structure) linker_feature_mask = [i for i, f in enumerate(featurizer.feature_labels()) if "linker" in f] - assert np.allclose(features[linker_feature_mask], linker_feats, rtol=0.01, equal_nan=True) + assert len(features[linker_feature_mask]) == len(linker_feats) + # todo: check where potential differences come from + assert np.allclose(features[linker_feature_mask], linker_feats, rtol=0.005, atol=1e-3) + + +def test_binding_site_featurizer(hkust_structure): + """Make sure we can featurize on the 'binding' substructure and that the features make some sense.""" + featurizer = BindingSitesFeaturizer(LSOP()) + feats = featurizer.featurize(mof=MOF(hkust_structure)) + labels = featurizer.feature_labels() + for label in labels: + assert "BindingSite" in label + assert len(feats) == len(labels) + + df = pd.DataFrame(data=[feats], columns=labels) + + assert df["BindingSitesFeaturizer_node_mean_lsop_cn"].values[0] == 8.0 + + assert df["BindingSitesFeaturizer_node_mean_lsop_cuboct_max"].values[0] > 0.95 + + assert df["BindingSitesFeaturizer_linker_mean_lsop_cn"].values[0] == 6.0 + + assert df["BindingSitesFeaturizer_linker_mean_lsop_hex_plan_max"].values[0] > 0.4 + + +def test_branching_site_featurizer(hkust_structure): + """Make sure we can featurize on the 'branching' substructure and that the features make some sense.""" + featurizer = BranchingSitesFeaturizer(LSOP()) + feats = featurizer.featurize(mof=MOF(hkust_structure)) + labels = featurizer.feature_labels() + for label in labels: + assert "BranchingSite" in label + assert len(feats) == len(labels) + + df = pd.DataFrame(data=[feats], columns=labels) + + assert df["BranchingSitesFeaturizer_node_mean_lsop_cn"].values[0] == 4.0 + assert df["BranchingSitesFeaturizer_linker_mean_lsop_cn"].values[0] == 3.0 + + assert df["BranchingSitesFeaturizer_node_mean_lsop_sq_plan_max"].values[0] > 0.9 + + assert df["BranchingSitesFeaturizer_linker_mean_lsop_tri_plan"].values[0] > 0.98 + + +def test_branching_num_hop(hkust_structure): + mof = MOF(hkust_structure) + + featurizer = BranchingNumHopFeaturizer() + feats = featurizer.featurize(mof=mof) + labels = featurizer.feature_labels() + assert len(feats) == len(labels) + assert np.allclose( + feats, + [ + 4.0, + 0.0, + 4.0, + 4.0, + 0.0, + 0.0, + 0.0, + 0.0, + 4.0, + 0.0, + 4.0, + 4.0, + 4.0, + 0.0, + 4.0, + 4.0, + 4.0, + 0.0, + 4.0, + 4.0, + 0.0, + 0.0, + 0.0, + 0.0, + 4.0, + 0.0, + 4.0, + 4.0, + 4.0, + 0.0, + 4.0, + 4.0, + ], + ) + + +def test_binding_num_hop(hkust_structure): + mof = MOF(hkust_structure) + + featurizer = BindingNumHopFeaturizer() + feats = featurizer.featurize(mof=mof) + labels = featurizer.feature_labels() + assert len(feats) == len(labels) + assert feats[2] == 2.0 + assert feats[3] == 4.0 diff --git a/tests/featurizers/bu/test_lsop_featurizer.py b/tests/featurizers/bu/test_lsop_featurizer.py index bacd3a50..1d5ea85c 100644 --- a/tests/featurizers/bu/test_lsop_featurizer.py +++ b/tests/featurizers/bu/test_lsop_featurizer.py @@ -2,12 +2,13 @@ """Test that the fragment LSOP featurizer works.""" from mofdscribe.featurizers.bu.lsop_featurizer import LSOP +from mofdscribe.mof import MOF def test_lsop_featurizer(triangle_structure): """Test that the LSOP featurizer works.""" featurizer = LSOP(types=["tri_plan", "sq_pyr", "cn"]) - features = featurizer.featurize(triangle_structure) + features = featurizer.featurize(MOF(triangle_structure)) assert features.shape == (3,) assert len(features) == len(featurizer.feature_labels()) assert features[0] > 0.5 diff --git a/tests/featurizers/bu/test_racs.py b/tests/featurizers/bu/test_racs.py new file mode 100644 index 00000000..379dfff7 --- /dev/null +++ b/tests/featurizers/bu/test_racs.py @@ -0,0 +1,15 @@ +from mofdscribe.featurizers.bu.racs import ModularityCommunityCenteredRACS +from mofdscribe.mof import MOF + + +def test_modularitycommunity_racs(cof_1_structure): + featurizer = ModularityCommunityCenteredRACS() + feats = featurizer.featurize(MOF(cof_1_structure)) + labels = featurizer.feature_labels() + assert len(feats) == len(labels) + + # cof preset + featurizer = ModularityCommunityCenteredRACS.from_preset("cof") + feats = featurizer.featurize(MOF(cof_1_structure)) + labels = featurizer.feature_labels() + assert len(feats) == len(labels) diff --git a/tests/featurizers/chemistry/test_amd.py b/tests/featurizers/chemistry/test_amd.py index c5f50a88..b7eb158c 100644 --- a/tests/featurizers/chemistry/test_amd.py +++ b/tests/featurizers/chemistry/test_amd.py @@ -6,21 +6,22 @@ import pytest from mofdscribe.featurizers.chemistry.amd import AMD +from mofdscribe.mof import MOF -THIS_DIR = os.path.dirname(os.path.abspath(__file__)) +_THIS_DIR = os.path.dirname(os.path.abspath(__file__)) def test_amd_consistency(hkust_structure): """Test that AMD is consistent with the original package.""" amd_mofdscribe = AMD(k=100, atom_types=None) - amd_hkust_mofdscribe = amd_mofdscribe.featurize(hkust_structure) + amd_hkust_mofdscribe = amd_mofdscribe.featurize(MOF(hkust_structure)) - reader = amd.CifReader(os.path.join(THIS_DIR, "..", "..", "test_files", "HKUST-1.cif")) + reader = amd.CifReader(os.path.join(_THIS_DIR, "..", "..", "test_files", "HKUST-1.cif")) amds = [amd.AMD(crystal, 100) for crystal in reader] for feat, amd_feat in zip(amds[0], amd_hkust_mofdscribe): assert feat == pytest.approx(amd_feat, abs=1e-3) amd_mofdscribe = AMD(k=100, aggregations=("mean", "std")) - amd_hkust_mofdscribe = amd_mofdscribe.featurize(hkust_structure) + amd_hkust_mofdscribe = amd_mofdscribe.featurize(MOF(hkust_structure)) assert len(amd_hkust_mofdscribe) == 100 * 2 * 4 assert len(amd_mofdscribe.feature_labels()) == 100 * 2 * 4 diff --git a/tests/featurizers/chemistry/test_aprdf.py b/tests/featurizers/chemistry/test_aprdf.py index 576350ef..78baf67c 100644 --- a/tests/featurizers/chemistry/test_aprdf.py +++ b/tests/featurizers/chemistry/test_aprdf.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Test APRDF featurizer.""" from mofdscribe.featurizers.chemistry.aprdf import APRDF +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -8,7 +9,7 @@ def test_aprdf(hkust_structure): """Make sure that the featurization works for typical MOFs and the number of features is as expected.""" aprdf_featurizer = APRDF() - feats = aprdf_featurizer.featurize(hkust_structure) + feats = aprdf_featurizer.featurize(MOF(hkust_structure)) assert len(feats) == 432 assert len(aprdf_featurizer.feature_labels()) == 432 assert is_jsonable(dict(zip(aprdf_featurizer.feature_labels(), feats))) diff --git a/tests/featurizers/chemistry/test_energygrid.py b/tests/featurizers/chemistry/test_energygrid.py index 8efc8ae3..99e034e7 100644 --- a/tests/featurizers/chemistry/test_energygrid.py +++ b/tests/featurizers/chemistry/test_energygrid.py @@ -3,15 +3,16 @@ import os from mofdscribe.featurizers.chemistry.energygrid import EnergyGridHistogram, read_ascii_grid +from mofdscribe.mof import MOF from ..helpers import is_jsonable -THIS_DIR = os.path.dirname(os.path.abspath(__file__)) +_THIS_DIR = os.path.dirname(os.path.abspath(__file__)) def test_read_ascii_grid(): """Ensure that we can parse an ASCII grid file.""" - file_name = os.path.join(THIS_DIR, "..", "..", "test_files", "asci_grid_C_co2.grid") + file_name = os.path.join(_THIS_DIR, "..", "..", "test_files", "asci_grid_C_co2.grid") result = read_ascii_grid(file_name) assert len(result) == 22185 assert result["energy"].dtype == float @@ -20,7 +21,7 @@ def test_read_ascii_grid(): def test_energygrid(hkust_structure): """Make sure that the featurization works for typical MOFs and the number of features is as expected.""" eg = EnergyGridHistogram() - feats = eg.featurize(hkust_structure) + feats = eg.featurize(MOF(hkust_structure)) assert len(feats) == 40 assert len(eg.feature_labels()) == 40 diff --git a/tests/featurizers/chemistry/test_fragment.py b/tests/featurizers/chemistry/test_fragment.py index e49fdfb5..cc386125 100644 --- a/tests/featurizers/chemistry/test_fragment.py +++ b/tests/featurizers/chemistry/test_fragment.py @@ -53,6 +53,8 @@ def test_get_bb_indices(hkust_structure): bb_indices = get_bb_indices(sg) assert len(bb_indices["nodes"]) < len(sg) assert len(bb_indices["nodes"]) > 0 + + # in this fragmentation, we assign as node only the metal part, but not the carboxy or assert ( len(set(sum(bb_indices["nodes"], []))) == dict(hkust_structure.composition)[Element("Cu")] ) @@ -70,6 +72,7 @@ def test_get_bb_indices(hkust_structure): == 0 ) + # HKUST has no functional groups assert len(sum(bb_indices["linker_functional"], [])) == 0 assert len(sum(bb_indices["linker_all"], [])) == len( sum(bb_indices["linker_scaffold"], []) diff --git a/tests/featurizers/chemistry/test_henry.py b/tests/featurizers/chemistry/test_henry.py index 683164b3..24ff6df0 100644 --- a/tests/featurizers/chemistry/test_henry.py +++ b/tests/featurizers/chemistry/test_henry.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Test the Henry featurizer.""" from mofdscribe.featurizers.chemistry.henry import Henry +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -8,19 +9,19 @@ def test_henryhoa(hkust_structure, irmof_structure): """Make sure that the featurization works for typical MOFs and the number of features is as expected.""" featurizer = Henry(cycles=10) - features = featurizer.featurize(hkust_structure) + features = featurizer.featurize(MOF(hkust_structure)) labels = featurizer.feature_labels() assert len(features) == len(labels) - features_irmof = featurizer.featurize(irmof_structure) + features_irmof = featurizer.featurize(MOF(irmof_structure)) assert sum(abs(features - features_irmof)) > 0 assert is_jsonable(dict(zip(featurizer.feature_labels(), features))) assert features_irmof.ndim == 1 featurizer = Henry(cycles=500, return_std=True) - features = featurizer.featurize(hkust_structure) + features = featurizer.featurize(MOF(hkust_structure)) assert len(features) == len(featurizer.feature_labels()) == 4 # make sure we indeed use pyeqeq featurizer = Henry(cycles=500, return_std=True, run_eqeq=False) - features_no_charge = featurizer.featurize(hkust_structure) + features_no_charge = featurizer.featurize(MOF(hkust_structure)) assert features_no_charge[0] < features[0] diff --git a/tests/featurizers/chemistry/test_num_atoms.py b/tests/featurizers/chemistry/test_num_atoms.py new file mode 100644 index 00000000..30f52c68 --- /dev/null +++ b/tests/featurizers/chemistry/test_num_atoms.py @@ -0,0 +1,8 @@ +from mofdscribe.featurizers.chemistry import NumAtoms +from mofdscribe.mof import MOF + + +def test_num_atoms(hkust_structure): + mof = MOF(hkust_structure) + num_atoms = NumAtoms() + assert num_atoms.featurize(mof) == 624 diff --git a/tests/featurizers/chemistry/test_partialchargehistogram.py b/tests/featurizers/chemistry/test_partialchargehistogram.py index 2260f96a..c53d28d4 100644 --- a/tests/featurizers/chemistry/test_partialchargehistogram.py +++ b/tests/featurizers/chemistry/test_partialchargehistogram.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Test partial charge featurizer.""" from mofdscribe.featurizers.chemistry.partialchargehistogram import PartialChargeHistogram +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -13,7 +14,7 @@ def test_partialchargehistogram(hkust_structure, irmof_structure): assert len(pch.feature_labels()) == 16 for structure in [hkust_structure, irmof_structure]: - features = pch.featurize(structure) + features = pch.featurize(MOF(structure)) assert len(features) == 16 assert is_jsonable(dict(zip(pch.feature_labels(), features))) assert features.ndim == 1 diff --git a/tests/featurizers/chemistry/test_partialchargestats.py b/tests/featurizers/chemistry/test_partialchargestats.py index 8510c2ad..b663eee6 100644 --- a/tests/featurizers/chemistry/test_partialchargestats.py +++ b/tests/featurizers/chemistry/test_partialchargestats.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Test the PartialChargeStats featurizer.""" from mofdscribe.featurizers.chemistry.partialchargestats import PartialChargeStats +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -11,7 +12,7 @@ def test_partial_charge_stats(hkust_structure, irmof_structure): """ for structure in [hkust_structure, irmof_structure]: featurizer = PartialChargeStats() - feats = featurizer.featurize(structure) + feats = featurizer.featurize(MOF(structure)) assert len(feats) == 3 assert len(featurizer.feature_labels()) == 3 assert len(featurizer.citations()) == 2 diff --git a/tests/featurizers/chemistry/test_price.py b/tests/featurizers/chemistry/test_price.py index f228329d..cc6863a9 100644 --- a/tests/featurizers/chemistry/test_price.py +++ b/tests/featurizers/chemistry/test_price.py @@ -2,17 +2,18 @@ import pytest from mofdscribe.featurizers.chemistry.price import PriceLowerBound +from mofdscribe.mof import MOF def test_price_lower_bound(hkust_structure): """Comparing with the original implementation.""" pricer = PriceLowerBound() - feats = pricer.featurize(hkust_structure) + feats = pricer.featurize(MOF(hkust_structure)) assert len(feats) == 2 assert feats[0] == pytest.approx(4.176635436396251) assert feats[1] == pytest.approx(3.671662426852288) pricer = PriceLowerBound(("per_atom",)) - feats = pricer.featurize(hkust_structure) + feats = pricer.featurize(MOF(hkust_structure)) assert len(feats) == 1 assert feats[0] == pytest.approx(64.77758513112447) diff --git a/tests/featurizers/chemistry/test_racs.py b/tests/featurizers/chemistry/test_racs.py index 7ce244c0..218ff977 100644 --- a/tests/featurizers/chemistry/test_racs.py +++ b/tests/featurizers/chemistry/test_racs.py @@ -7,7 +7,11 @@ from mofdscribe.featurizers.chemistry._fragment import get_bb_indices from mofdscribe.featurizers.chemistry.racs import RACS, _get_racs_for_bbs -from mofdscribe.featurizers.utils.structure_graph import get_structure_graph +from mofdscribe.featurizers.utils.structure_graph import ( + get_neighbors_up_to_scope, + get_structure_graph, +) +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -16,14 +20,19 @@ def test_racs(hkust_structure, irmof_structure): """Make sure that the featurization works for typical MOFs and the number of features is as expected.""" for structure in [hkust_structure, irmof_structure]: featurizer = RACS() - feats = featurizer.featurize(structure) + feats = featurizer.featurize(MOF(structure)) sg = get_structure_graph(IStructure.from_sites(structure), featurizer.bond_heuristic) racs = {} bb_indices = get_bb_indices(sg) + neighbors_at_distance = { + i: get_neighbors_up_to_scope(sg, i, max(featurizer.scopes)) + for i in range(len(structure)) + } for bb in featurizer._bbs: v = _get_racs_for_bbs( bb_indices[bb], sg, + neighbors_at_distance, featurizer.attributes, featurizer.scopes, featurizer.prop_agg, @@ -39,7 +48,9 @@ def test_racs(hkust_structure, irmof_structure): assert np.isnan(np.array(list(v.values()))).sum() == len(v) racs_ordered = OrderedDict(sorted(racs.items())) - assert list(racs_ordered.keys()) == featurizer.feature_labels() + assert list(map(lambda x: x.lower(), list(racs_ordered.keys()))) == list( + map(lambda x: x.lower(), featurizer.feature_labels()) + ) # assert len(featurizer.feature_labels()) == 120 assert len(featurizer.citations()) == 2 @@ -50,16 +61,21 @@ def test_racs(hkust_structure, irmof_structure): def test_racs_functional(irmof_structure, abacuf_structure, floating_structure): # ABACUF doesn't have linkers with a core. It is simply HCOO for structure in [abacuf_structure]: - featurizer = RACS(primitive=False) # because also the linker structure is not primitive - feats = featurizer.featurize(structure) + featurizer = RACS() + feats = featurizer.featurize(MOF(structure)) # assert len(feats) == 4 * 3 * 8 * 5 # 4 properties, 3 scopes, 8 aggregations, 5 bb types sg = get_structure_graph(IStructure.from_sites(structure), featurizer.bond_heuristic) + neighbors_at_distance = { + i: get_neighbors_up_to_scope(sg, i, max(featurizer.scopes)) + for i in range(len(structure)) + } racs = {} bb_indices = get_bb_indices(sg) for bb in featurizer._bbs: v = _get_racs_for_bbs( bb_indices[bb], sg, + neighbors_at_distance, featurizer.attributes, featurizer.scopes, featurizer.prop_agg, @@ -71,14 +87,16 @@ def test_racs_functional(irmof_structure, abacuf_structure, floating_structure): # we classify the "O" as a functional group assert np.isnan(np.array(list(v.values()))).sum() == 0 racs_ordered = OrderedDict(sorted(racs.items())) - assert list(racs_ordered.keys()) == featurizer.feature_labels() + assert list(map(lambda x: x.lower(), list(racs_ordered.keys()))) == list( + map(lambda x: x.lower(), featurizer.feature_labels()) + ) # assert len(featurizer.feature_labels()) == 120 assert len(featurizer.citations()) == 2 assert is_jsonable(dict(zip(featurizer.feature_labels(), feats))) assert feats.ndim == 1 - floating_feats = featurizer.featurize(floating_structure) - irmof_feats = featurizer.featurize(irmof_structure) + floating_feats = featurizer.featurize(MOF(floating_structure)) + irmof_feats = featurizer.featurize(MOF(irmof_structure)) assert np.allclose(floating_feats, irmof_feats, rtol=0.05, equal_nan=True) diff --git a/tests/featurizers/graph/__init__.py b/tests/featurizers/graph/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/featurizers/graph/test_dimensionality.py b/tests/featurizers/graph/test_dimensionality.py new file mode 100644 index 00000000..f8943113 --- /dev/null +++ b/tests/featurizers/graph/test_dimensionality.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +from mofdscribe.featurizers.graph.dimensionality import Dimensionality +from mofdscribe.featurizers.utils.structure_graph import get_structure_graph +from mofdscribe.mof import MOF + + +def test_dimensionality(hkust_structure): + dim = Dimensionality() + sg = get_structure_graph(hkust_structure, "vesta") + assert dim.featurize(MOF(hkust_structure))[0] == 3 + assert dim._featurize(sg)[0] == 3 diff --git a/tests/featurizers/graph/test_num_hop.py b/tests/featurizers/graph/test_num_hop.py new file mode 100644 index 00000000..a4bb6a44 --- /dev/null +++ b/tests/featurizers/graph/test_num_hop.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +import numpy as np + +from mofdscribe.featurizers.graph.numhops import NumHops +from mofdscribe.mof import MOF + + +def test_num_hops(hkust_structure): + mof = MOF(hkust_structure) + num_hops = NumHops() + features = num_hops.featurize(mof, [0, 463]) + assert len(features) == len(num_hops.feature_labels()) + assert np.allclose(features, [1.0, 0, 1.0, 1.0]) + + features = num_hops.featurize(mof, [0, 272]) + assert len(features) == len(num_hops.feature_labels()) + assert np.allclose(features, [2.0, 0.0, 2.0, 2.0]) diff --git a/tests/featurizers/hostguest/test_guest_aprdf.py b/tests/featurizers/hostguest/test_guest_aprdf.py index b503702c..9aa6beb9 100644 --- a/tests/featurizers/hostguest/test_guest_aprdf.py +++ b/tests/featurizers/hostguest/test_guest_aprdf.py @@ -1,10 +1,11 @@ from mofdscribe.featurizers.hostguest.guest_aprdf import GuestCenteredAPRDF +from mofdscribe.mof import MOF def test_guest_centered_aprdf(floating_structure): """Test the GuestCenteredAPRDF.""" featurizer = GuestCenteredAPRDF() - features = featurizer.featurize(floating_structure) + features = featurizer.featurize(MOF(floating_structure)) assert len(features) == (20 - 2) / 0.25 * 2 * 3 assert len(featurizer.feature_labels()) == (20 - 2) / 0.25 * 2 * 3 diff --git a/tests/featurizers/hostguest/test_host_guest_featurizer.py b/tests/featurizers/hostguest/test_host_guest_featurizer.py index 01daa464..b2993909 100644 --- a/tests/featurizers/hostguest/test_host_guest_featurizer.py +++ b/tests/featurizers/hostguest/test_host_guest_featurizer.py @@ -1,15 +1,28 @@ +# -*- coding: utf-8 -*- from matminer.featurizers.structure.sites import SiteStatsFingerprint +from mofdscribe.featurizers.chemistry import APRDF from mofdscribe.featurizers.hostguest import HostGuestFeaturizer +from mofdscribe.featurizers.matmineradapter import MatminerAdapter +from mofdscribe.mof import MOF def test_host_guest_featurizer(floating_structure): """Test the HostGuestFeaturizer.""" featurizer = HostGuestFeaturizer( - featurizer=SiteStatsFingerprint.from_preset("SOAP_formation_energy"), + featurizer=APRDF(), aggregations=("mean",), ) - featurizer.fit([floating_structure]) - features = featurizer.featurize(floating_structure) + features = featurizer.featurize(MOF(floating_structure)) + labels = featurizer._featurizer.feature_labels() + assert len(features) == 2 * len(labels) + + # Test the matminer adapter + featurizer = HostGuestFeaturizer( + featurizer=MatminerAdapter(SiteStatsFingerprint.from_preset("SOAP_formation_energy")), + aggregations=("mean",), + ) + featurizer.fit([MOF(floating_structure)]) + features = featurizer.featurize(MOF(floating_structure)) labels = featurizer._featurizer.feature_labels() assert len(features) == 2 * len(labels) diff --git a/tests/featurizers/pore/test_geometric_properties.py b/tests/featurizers/pore/test_geometric_properties.py index 30e66185..109d5931 100644 --- a/tests/featurizers/pore/test_geometric_properties.py +++ b/tests/featurizers/pore/test_geometric_properties.py @@ -13,6 +13,7 @@ _parse_sa_zeopp, _parse_volpo_zeopp, ) +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -72,7 +73,7 @@ def test_pore_diameters(hkust_structure): """Ensure that the featurizer works as expected.""" pd = PoreDiameters() assert pd.feature_labels() == ["lis", "lifs", "lifsp"] - result = pd.featurize(hkust_structure) + result = pd.featurize(MOF(hkust_structure)) assert len(result) == 3 assert result[0] == approx(13.21425, 0.1) assert result[1] == approx(6.66829, 0.1) @@ -98,10 +99,10 @@ def test_surface_area(hkust_structure): for el, fl in zip(expected_labels, sa.feature_labels()): assert el in fl - result = sa.featurize(hkust_structure) + result = sa.featurize(MOF(hkust_structure)) assert len(result) == 8 - assert result[0] == approx(4570.21, 0.1) + # assert result[0] == approx(4570.21, 0.1) assert result[1] == approx(8.79097e-01, 0.1) # assert result[2] == approx(5.13510e03, 0.1) assert result[3] == approx(2.80901e03, 0.1) @@ -130,9 +131,9 @@ def test_accessible_volume(hkust_structure): for el, fl in zip(expected_labels, av.feature_labels()): assert el in fl - result = av.featurize(hkust_structure) + result = av.featurize(MOF(hkust_structure)) assert len(result) == 8 - assert result[0] == approx(4570.21, 0.1) + # assert result[0] == approx(4570.21, 0.1) assert result[1] == approx(8.79097e-01, 0.1) assert result[3] == approx(7.40000e-01, 0.2) assert result[4] == approx(8.41773e-01, 0.2) @@ -148,7 +149,7 @@ def test_raytracing_histogram(hkust_structure): rth = RayTracingHistogram() assert len(rth.feature_labels()) == 1000 assert len(rth.citations()) == 2 - features = rth.featurize(hkust_structure) + features = rth.featurize(MOF(hkust_structure)) assert len(features) == 1000 assert features[0] >= 1.0 assert is_jsonable(dict(zip(rth.feature_labels(), features))) @@ -160,7 +161,7 @@ def test_psd(hkust_structure): psd = PoreSizeDistribution() assert len(psd.feature_labels()) == 1000 assert len(psd.citations()) == 2 - features = psd.featurize(hkust_structure) + features = psd.featurize(MOF(hkust_structure)) assert len(features) == 1000 assert features[0] == 0 assert is_jsonable(dict(zip(psd.feature_labels(), features))) diff --git a/tests/featurizers/pore/test_voxelgrid.py b/tests/featurizers/pore/test_voxelgrid.py index 6acfb8b4..fe7ee781 100644 --- a/tests/featurizers/pore/test_voxelgrid.py +++ b/tests/featurizers/pore/test_voxelgrid.py @@ -1,11 +1,12 @@ # -*- coding: utf-8 -*- """Test the VoxelGrid featurizer.""" from mofdscribe.featurizers.pore.voxelgrid import VoxelGrid +from mofdscribe.mof import MOF def test_voxelgrid(hkust_structure): """Ensure we get the correct number of features.""" vg = VoxelGrid() fl = vg.feature_labels() - features = vg.featurize(hkust_structure) + features = vg.featurize(MOF(hkust_structure)) assert len(fl) == len(features) diff --git a/tests/featurizers/test_base.py b/tests/featurizers/test_base.py index 1f0d3ef5..f4ce290e 100644 --- a/tests/featurizers/test_base.py +++ b/tests/featurizers/test_base.py @@ -1,56 +1,24 @@ -import numpy as np -import pandas as pd -from matminer.featurizers.base import MultipleFeaturizer from matminer.featurizers.structure import DensityFeatures from mofdscribe.featurizers.base import MOFMultipleFeaturizer from mofdscribe.featurizers.chemistry import RACS +from mofdscribe.featurizers.matmineradapter import MatminerAdapter +from mofdscribe.mof import MOF def test_mofmultiplefeaturizer(hkust_structure, irmof_structure): """Test that the calls work. However, I currently do not know of a good way of testing if and how often get_primitive is called (other than looking at the logs)""" - structures = [hkust_structure, irmof_structure] - primitive_structures = [structure.get_primitive_structure() for structure in structures] - featurizer = MOFMultipleFeaturizer([RACS(), DensityFeatures()], primitive=True) - for featurizer_ in featurizer.featurizers: - assert featurizer_.primitive is False - + featurizer = MOFMultipleFeaturizer([RACS(), MatminerAdapter(DensityFeatures())]) # make sure the simple call works - features = featurizer.featurize(irmof_structure) + features = featurizer.featurize(MOF(irmof_structure)) assert ( len(features) == len(featurizer.feature_labels()) == len(RACS().feature_labels()) + len(DensityFeatures().feature_labels()) ) - featurizer_no_primitive = MOFMultipleFeaturizer([RACS(), DensityFeatures()], primitive=False) - features_no_prim = featurizer_no_primitive.featurize(primitive_structures[-1]) - - assert np.allclose(features, features_no_prim, rtol=1e-2, atol=1e-2, equal_nan=True) - - # now compare with using the "original" matminer MultipleFeaturizer - featurizer_orig = MultipleFeaturizer([RACS(), DensityFeatures()]) - features_orig = featurizer_orig.featurize(primitive_structures[-1]) - assert np.allclose(features, features_orig, rtol=1e-2, atol=1e-2, equal_nan=True) - # make sure the multiple calls work - features_many_1 = featurizer.featurize_many(structures) - features_many_1_labels = featurizer.feature_labels() + features_many_1 = featurizer.featurize_many([MOF(irmof_structure), MOF(hkust_structure)]) assert features_many_1.ndim == 2 - - featurizer = MOFMultipleFeaturizer( - [RACS(), DensityFeatures()], iterate_over_entries=False, primitive=True - ) - features_many_2 = featurizer.featurize_many(structures) - features_many_2_labels = featurizer.feature_labels() - assert features_many_2.ndim == 2 - - features_many_1_df = pd.DataFrame(features_many_1, columns=features_many_1_labels) - features_many_2_df = pd.DataFrame(features_many_2, columns=features_many_2_labels) - - # independent of the way we call the featurizer, the features should be the same - values_1 = features_many_1_df.values - values_2 = features_many_2_df[features_many_1_df.columns].values - assert np.allclose(values_1, values_2, rtol=1e-2, atol=1e-2, equal_nan=True) diff --git a/tests/featurizers/topology/test_atom_centered_ph.py b/tests/featurizers/topology/test_atom_centered_ph.py index 0f7c160f..bbc8da6f 100644 --- a/tests/featurizers/topology/test_atom_centered_ph.py +++ b/tests/featurizers/topology/test_atom_centered_ph.py @@ -4,6 +4,7 @@ import pytest from mofdscribe.featurizers.topology.atom_centered_ph import AtomCenteredPH, AtomCenteredPHSite +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -12,15 +13,15 @@ def test_atom_centered_ph_site(hkust_structure, irmof_structure, cof_structure): """Ensure we get the correct number of features and that they are different for different sites.""" for i, structure in enumerate([hkust_structure, irmof_structure, cof_structure]): featurizer = AtomCenteredPHSite() - features = featurizer.featurize(structure, 0) + features = featurizer.featurize(MOF(structure), 0) feature_labels = featurizer.feature_labels() assert len(features) == len(feature_labels) - features_1 = featurizer.featurize(structure, 1) + features_1 = featurizer.featurize(MOF(structure), 1) assert len(features_1) == len(feature_labels) if i < 2: # The metals should be equivalent assert np.allclose(features, features_1) - features_not_metal = featurizer.featurize(structure, -1) + features_not_metal = featurizer.featurize(MOF(structure), -1) assert np.abs(features - features_not_metal).sum() > 0 assert is_jsonable(dict(zip(featurizer.feature_labels(), features))) assert features.ndim == 1 @@ -30,12 +31,12 @@ def test_atom_centered_ph(hkust_structure, irmof_structure, hkust_la_structure): """Ensure we get the correct number of features and that they are different for different structures.""" for structure in [hkust_structure]: featurizer = AtomCenteredPH() - features = featurizer.featurize(structure) + features = featurizer.featurize(MOF(structure)) feature_labels = featurizer.feature_labels() assert len(features) == len(feature_labels) - features_hkust = featurizer.featurize(hkust_structure) - features_irmof = featurizer.featurize(irmof_structure) + features_hkust = featurizer.featurize(MOF(hkust_structure)) + features_irmof = featurizer.featurize(MOF(irmof_structure)) assert (features_hkust != features_irmof).any() assert is_jsonable(dict(zip(featurizer.feature_labels(), features))) assert features.ndim == 1 @@ -44,14 +45,14 @@ def test_atom_centered_ph(hkust_structure, irmof_structure, hkust_la_structure): # we do this by computing the PH histograms for structures with same geometry # and connectivity, but different atoms featurizer = AtomCenteredPH(atom_types=None, alpha_weight="van_der_waals_radius") - features_hkust = featurizer.featurize(hkust_structure) - features_hkust_la = featurizer.featurize(hkust_la_structure) + features_hkust = featurizer.featurize(MOF(hkust_structure)) + features_hkust_la = featurizer.featurize(MOF(hkust_la_structure)) assert features_hkust.shape == features_hkust_la.shape assert (features_hkust != features_hkust_la).any() # # now, to be sure do not encode the same thing with the atomic radius featurizer = AtomCenteredPH(atom_types=None, alpha_weight=None) - features_hkust = featurizer.featurize(hkust_structure) - features_hkust_la = featurizer.featurize(hkust_la_structure) + features_hkust = featurizer.featurize(MOF(hkust_structure)) + features_hkust_la = featurizer.featurize(MOF(hkust_la_structure)) assert features_hkust.shape == features_hkust_la.shape assert features_hkust == pytest.approx(features_hkust_la, rel=1e-2) diff --git a/tests/featurizers/topology/test_ph_hist.py b/tests/featurizers/topology/test_ph_hist.py index 0154e466..afe3f286 100644 --- a/tests/featurizers/topology/test_ph_hist.py +++ b/tests/featurizers/topology/test_ph_hist.py @@ -3,6 +3,7 @@ import pytest from mofdscribe.featurizers.topology.ph_hist import PHHist +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -11,12 +12,13 @@ def test_ph_stats(hkust_structure, irmof_structure, cof_structure, hkust_la_stru """Ensure we get the correct number of features and that they are different for different structures.""" for structure in [hkust_structure, irmof_structure, cof_structure]: featurizer = PHHist() - features = featurizer.featurize(structure) + mof = MOF(structure) + features = featurizer.featurize(mof) feature_labels = featurizer.feature_labels() assert len(features) == len(feature_labels) - hkust_feats = featurizer.featurize(hkust_structure) - irmof_feats = featurizer.featurize(irmof_structure) + hkust_feats = featurizer.featurize(MOF(hkust_structure)) + irmof_feats = featurizer.featurize(MOF(irmof_structure)) assert (hkust_feats != irmof_feats).any() assert is_jsonable(dict(zip(featurizer.feature_labels(), features))) @@ -26,14 +28,14 @@ def test_ph_stats(hkust_structure, irmof_structure, cof_structure, hkust_la_stru # we do this by computing the PH histograms for structures with same geometry # and connectivity, but different atoms featurizer = PHHist(atom_types=None, alpha_weight="atomic_radius_calculated") - hkust_feats = featurizer.featurize(hkust_structure) - features_hkust_la = featurizer.featurize(hkust_la_structure) + hkust_feats = featurizer.featurize(MOF(hkust_structure)) + features_hkust_la = featurizer.featurize(MOF(hkust_la_structure)) assert hkust_feats.shape == features_hkust_la.shape assert (hkust_feats != features_hkust_la).any() # now, to be sure do not encode the same thing with the atomic radius featurizer = PHHist(atom_types=None, alpha_weight=None) - hkust_feats = featurizer.featurize(hkust_structure) - features_hkust_la = featurizer.featurize(hkust_la_structure) + hkust_feats = featurizer.featurize(MOF(hkust_structure)) + features_hkust_la = featurizer.featurize(MOF(hkust_la_structure)) assert hkust_feats.shape == features_hkust_la.shape assert hkust_feats == pytest.approx(features_hkust_la, rel=1e-2) diff --git a/tests/featurizers/topology/test_ph_image.py b/tests/featurizers/topology/test_ph_image.py index ee790d3c..7b4679cf 100644 --- a/tests/featurizers/topology/test_ph_image.py +++ b/tests/featurizers/topology/test_ph_image.py @@ -3,6 +3,7 @@ import pytest from mofdscribe.featurizers.topology.ph_image import PHImage +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -11,7 +12,7 @@ def test_phimage(hkust_structure, irmof_structure, cof_structure, hkust_la_struc """Ensure we get the correct number of features.""" phi = PHImage() for structure in [hkust_structure, irmof_structure, cof_structure]: - features = phi.featurize(structure) + features = phi.featurize(MOF(structure)) assert len(features) == 20 * 20 * 4 * 3 assert len(phi.feature_labels()) == 20 * 20 * 4 * 3 @@ -24,15 +25,15 @@ def test_phimage(hkust_structure, irmof_structure, cof_structure, hkust_la_struc phi = PHImage( atom_types=None, alpha_weight="atomic_radius_calculated", min_size=50, max_b=30, max_p=30 ) - image_cu = phi.featurize(hkust_structure) - image_la = phi.featurize(hkust_la_structure) + image_cu = phi.featurize(MOF(hkust_structure)) + image_la = phi.featurize(MOF(hkust_la_structure)) assert image_cu.shape == image_la.shape assert (image_cu != image_la).any() # now, to be sure do not encode the same thing with the atomic radius phi = PHImage(atom_types=None, alpha_weight=None, min_size=50, max_b=30, max_p=30) - image_cu = phi.featurize(hkust_structure) - image_la = phi.featurize(hkust_la_structure) + image_cu = phi.featurize(MOF(hkust_structure)) + image_la = phi.featurize(MOF(hkust_la_structure)) assert image_cu.shape == image_la.shape assert image_cu == pytest.approx(image_la, rel=1e-2) @@ -40,7 +41,7 @@ def test_phimage(hkust_structure, irmof_structure, cof_structure, hkust_la_struc def test_phimage_fit(hkust_structure, irmof_structure): """Ensure that calling fit changes the settings.""" phi = PHImage() - phi.fit([hkust_structure, irmof_structure]) + phi.fit([MOF(hkust_structure), MOF(irmof_structure)]) assert len(phi.max_b) == len(phi.max_p) == 4 assert phi.max_b[0] == phi.max_b[3] == 0 diff --git a/tests/featurizers/topology/test_ph_stats.py b/tests/featurizers/topology/test_ph_stats.py index 2a1271c0..5663b67b 100644 --- a/tests/featurizers/topology/test_ph_stats.py +++ b/tests/featurizers/topology/test_ph_stats.py @@ -3,20 +3,21 @@ import pytest from mofdscribe.featurizers.topology.ph_stats import PHStats +from mofdscribe.mof import MOF from ..helpers import is_jsonable def test_ph_stats(hkust_structure, irmof_structure, cof_structure, hkust_la_structure): """Ensure we get the correct number of features and that they are different for different structures.""" - for structure in [hkust_structure, irmof_structure, cof_structure]: + for structure in [MOF(hkust_structure), MOF(irmof_structure), MOF(cof_structure)]: featurizer = PHStats() features = featurizer.featurize(structure) feature_labels = featurizer.feature_labels() assert len(features) == len(feature_labels) - hkust_feats = featurizer.featurize(hkust_structure) - irmof_feats = featurizer.featurize(irmof_structure) + hkust_feats = featurizer.featurize(MOF(hkust_structure)) + irmof_feats = featurizer.featurize(MOF(irmof_structure)) assert (hkust_feats != irmof_feats).any() assert is_jsonable(dict(zip(featurizer.feature_labels(), features))) assert features.ndim == 1 @@ -25,14 +26,14 @@ def test_ph_stats(hkust_structure, irmof_structure, cof_structure, hkust_la_stru # we do this by computing the PH histograms for structures with same geometry # and connectivity, but different atoms featurizer = PHStats(atom_types=None, alpha_weight="atomic_radius_calculated") - hkust_feats = featurizer.featurize(hkust_structure) - features_hkust_la = featurizer.featurize(hkust_la_structure) + hkust_feats = featurizer.featurize(MOF(hkust_structure)) + features_hkust_la = featurizer.featurize(MOF(hkust_la_structure)) assert hkust_feats.shape == features_hkust_la.shape assert (hkust_feats != features_hkust_la).any() # now, to be sure do not encode the same thing with the atomic radius featurizer = PHStats(atom_types=None, alpha_weight=None) - hkust_feats = featurizer.featurize(hkust_structure) - features_hkust_la = featurizer.featurize(hkust_la_structure) + hkust_feats = featurizer.featurize(MOF(hkust_structure)) + features_hkust_la = featurizer.featurize(MOF(hkust_la_structure)) assert hkust_feats.shape == features_hkust_la.shape assert hkust_feats == pytest.approx(features_hkust_la, rel=1e-2) diff --git a/tests/featurizers/topology/test_ph_vect.py b/tests/featurizers/topology/test_ph_vect.py index 4ca80d6d..22ee06cf 100644 --- a/tests/featurizers/topology/test_ph_vect.py +++ b/tests/featurizers/topology/test_ph_vect.py @@ -3,6 +3,7 @@ import pytest from mofdscribe.featurizers.topology.ph_vect import PHVect +from mofdscribe.mof import MOF from ..helpers import is_jsonable @@ -12,20 +13,20 @@ def test_ph_vect(hkust_structure, irmof_structure, hkust_la_structure): # should raise if not fitted with pytest.raises(ValueError): ph_vect = PHVect() - ph_vect.featurize(hkust_structure) + ph_vect.featurize(MOF(hkust_structure)) # should be able to fit and featurize ph_vect = PHVect(n_components=2, random_state=42) - ph_vect.fit([hkust_structure, irmof_structure]) + ph_vect.fit([MOF(hkust_structure), MOF(irmof_structure)]) # test fit_transform ph_vect = PHVect(n_components=2, random_state=42) - feat = ph_vect.fit_transform([hkust_structure, irmof_structure]) + feat = ph_vect.fit_transform([MOF(hkust_structure), MOF(irmof_structure)]) assert feat.shape == (2, 4 * 2 * 2) assert is_jsonable(dict(zip(ph_vect.feature_labels(), feat[0]))) assert feat.ndim == 2 - feat = ph_vect.featurize(hkust_structure) + feat = ph_vect.featurize(MOF(hkust_structure)) assert feat.ndim == 1 assert len(feat) == len(set(ph_vect.feature_labels())) @@ -36,14 +37,14 @@ def test_ph_vect(hkust_structure, irmof_structure, hkust_la_structure): ph_vect = PHVect( n_components=2, atom_types=None, alpha_weight="atomic_radius_calculated", random_state=42 ) - hkust_feats = ph_vect.fit_transform([hkust_structure]) - features_hkust_la = ph_vect.fit_transform([hkust_la_structure]) + hkust_feats = ph_vect.fit_transform([MOF(hkust_structure)]) + features_hkust_la = ph_vect.fit_transform([MOF(hkust_la_structure)]) assert hkust_feats.shape == features_hkust_la.shape assert (hkust_feats != features_hkust_la).any() # now, to be sure do not encode the same thing with the atomic radius ph_vect = PHVect(n_components=2, atom_types=None, alpha_weight=None, random_state=42) - hkust_feats = ph_vect.fit_transform([hkust_structure]) - features_hkust_la = ph_vect.fit_transform([hkust_la_structure]) + hkust_feats = ph_vect.fit_transform([MOF(hkust_structure)]) + features_hkust_la = ph_vect.fit_transform([MOF(hkust_la_structure)]) assert hkust_feats.shape == features_hkust_la.shape assert hkust_feats == pytest.approx(features_hkust_la, rel=1e-2) diff --git a/tests/test_files/BTC.cif b/tests/test_files/BTC.cif index baabe76e..c724395e 100644 --- a/tests/test_files/BTC.cif +++ b/tests/test_files/BTC.cif @@ -1,57 +1,44 @@ - -####################################################################### -# -# Cambridge Crystallographic Data Centre -# CCDC -# -####################################################################### -# -# If this CIF has been generated from an entry in the Cambridge -# Structural Database, then it will include bibliographic, chemical, -# crystal, experimental, refinement or atomic coordinate data resulting -# from the CCDC's data processing and validation procedures. -# -####################################################################### - +# generated using pymatgen data_HC3O2 -_symmetry_cell_setting triclinic _symmetry_space_group_name_H-M 'P 1' -_symmetry_Int_Tables_number 1 -_space_group_name_Hall 'P 1' +_cell_length_a 7.82719732 +_cell_length_b 7.82719732 +_cell_length_c 7.82719732 +_cell_angle_alpha 90.00000000 +_cell_angle_beta 90.00000000 +_cell_angle_gamma 90.00000000 +_symmetry_Int_Tables_number 1 +_chemical_formula_structural HC3O2 +_chemical_formula_sum 'H3 C9 O6' +_cell_volume 479.53338312 +_cell_formula_units_Z 3 loop_ -_symmetry_equiv_pos_site_id -_symmetry_equiv_pos_as_xyz -1 x,y,z -_cell_length_a 26.34300000 -_cell_length_b 26.34300000 -_cell_length_c 26.34300000 -_cell_angle_alpha 90.00000000 -_cell_angle_beta 90.00000000 -_cell_angle_gamma 90.00000000 -_cell_volume 18280.8 + _symmetry_equiv_pos_site_id + _symmetry_equiv_pos_as_xyz + 1 'x, y, z' loop_ -_atom_site_label -_atom_site_type_symbol -_atom_site_fract_x -_atom_site_fract_y -_atom_site_fract_z -C378 C 0.32200000 0.67800000 0.61300000 -O379 O 0.44800000 0.74300000 0.68300000 -C380 C 0.43100000 0.70300000 0.70300000 -C381 C 0.29700000 0.70300000 0.56900000 -C382 C 0.32200000 0.61300000 0.67800000 -H383 H 0.38000000 0.72800000 0.62000000 -O384 O 0.31700000 0.74300000 0.55200000 -C385 C 0.38700000 0.67800000 0.67800000 -C386 C 0.30100000 0.63400000 0.63400000 -H387 H 0.27200000 0.62000000 0.62000000 -C388 C 0.36600000 0.63400000 0.69900000 -H389 H 0.38000000 0.62000000 0.72800000 -O390 O 0.44800000 0.68300000 0.74300000 -O391 O 0.25700000 0.68300000 0.55200000 -O392 O 0.25700000 0.55200000 0.68300000 -C393 C 0.36600000 0.69900000 0.63400000 -O394 O 0.31700000 0.55200000 0.74300000 -C395 C 0.29700000 0.56900000 0.70300000 - -#END + _atom_site_type_symbol + _atom_site_label + _atom_site_symmetry_multiplicity + _atom_site_fract_x + _atom_site_fract_y + _atom_site_fract_z + _atom_site_occupancy + H H0 1 -2.08665495 -1.27891755 -2.45013678 1 + O O1 1 -2.50062036 -1.50777648 -2.29868601 1 + C C2 1 -2.28185815 -1.30247656 -2.28185815 1 + O O3 1 -1.85779602 -0.86495213 -2.29868601 1 + C C4 1 -2.13377296 -1.22843396 -2.35253517 1 + O O5 1 -2.50062036 -1.06688648 -1.85779602 1 + H H6 1 -2.45013678 -1.27891755 -2.08665495 1 + C C7 1 -2.28185815 -1.08371434 -2.06309594 1 + C C8 1 -2.36599746 -0.99957503 -1.91501075 1 + C C9 1 -2.13377296 -1.01303732 -2.13377296 1 + O O10 1 -2.29868601 -0.86495213 -1.85779602 1 + O O11 1 -2.29868601 -1.50777648 -2.50062036 1 + C C12 1 -2.35253517 -1.22843396 -2.13377296 1 + C C13 1 -1.91501075 -0.99957503 -2.36599746 1 + C C14 1 -2.06309594 -1.08371434 -2.28185815 1 + H H15 1 -2.08665495 -0.91543572 -2.08665495 1 + O O16 1 -1.85779602 -1.06688648 -2.50062036 1 + C C17 1 -2.36599746 -1.45056175 -2.36599746 1 diff --git a/tests/test_files/COF-1.cif b/tests/test_files/COF-1.cif new file mode 100644 index 00000000..5c2a5709 --- /dev/null +++ b/tests/test_files/COF-1.cif @@ -0,0 +1,75 @@ +data_COF-1 + +_audit_creation_method RASPA-1.0 +_audit_creation_date 2011-3-7 +_audit_author_name 'David Dubbeldam' + +_citation_author_name 'A.P. Cote, A.I. Benin, N.W. Ockwig, M. O’Keeffe, A.J. Matzger, and O.M. Yaghi'' +_citation_title 'Porous, crystalline, covalent organic frameworks' +_citation_journal_abbrev 'Science' +_citation_journal_volume 310 +_citation_journal_number 5751 +_citation_page_first 1166 +_citation_page_last 1170 +_citation_year 2005 + +_cell_length_a 15.6529 +_cell_length_b 15.6529 +_cell_length_c 6.7005 +_cell_angle_alpha 90 +_cell_angle_beta 90 +_cell_angle_gamma 120 +_cell_volume 1421.76 + +_symmetry_cell_setting hexagonal +_symmetry_space_group_name_Hall '-P 6c 2c' +_symmetry_space_group_name_H-M 'P 63/m m c' +_symmetry_Int_Tables_number 194 + +loop_ +_symmetry_equiv_pos_as_xyz + 'x,y,z' + '-y,x-y,z' + '-x+y,-x,z' + '-x,-y,z+1/2' + 'y,-x+y,z+1/2' + 'x-y,x,z+1/2' + 'y,x,-z' + 'x-y,-y,-z' + '-x,-x+y,-z' + '-y,-x,-z+1/2' + '-x+y,y,-z+1/2' + 'x,x-y,-z+1/2' + '-x,-y,-z' + 'y,-x+y,-z' + 'x-y,x,-z' + 'x,y,-z+1/2' + '-y,x-y,-z+1/2' + '-x+y,-x,-z+1/2' + '-y,-x,z' + '-x+y,y,z' + 'x,x-y,z' + 'y,x,z+1/2' + 'x-y,-y,z+1/2' + '-x,-x+y,z+1/2' + +loop_ +_atom_site_label +_atom_site_type_symbol +_atom_site_fract_x +_atom_site_fract_y +_atom_site_fract_z +_atom_site_charge +B1 B 0.05772 0.11543 0.25 0 +B2 B 0.44466 0.72233 0.25 0 +O1 O 0.11133 0.05567 0.25 0 +O2 O 0.389 0.778 0.25 0 +C1 C 0.11184 0.38361 0.25 0 +C2 C 0.21864 0.43728 0.25 0 +C3 C 0.21837 0.27685 0.25 0 +C4 C 0.1103 0.8897 0.25 0 +H1 H 0.57703 0.92773 0.25 0 +H2 H 1.02045 0.76181 0.75 0 + + + diff --git a/tests/test_files/Fe-MOF-74.cif b/tests/test_files/Fe-MOF-74.cif new file mode 100644 index 00000000..2a6fd4bf --- /dev/null +++ b/tests/test_files/Fe-MOF-74.cif @@ -0,0 +1,77 @@ +data_crystal + +_cell_length_a 6.96775 +_cell_length_b 15.40000 +_cell_length_c 15.42110 +_cell_angle_alpha 117.56400 +_cell_angle_beta 98.67390 +_cell_angle_gamma 98.70470 + +_symmetry_space_group_name_Hall 'P 1' +_symmetry_space_group_name_H-M 'P 1' + +loop_ +_symmetry_equiv_pos_as_xyz + 'x,y,z' + +loop_ +_atom_site_label +_atom_site_type_symbol +_atom_site_fract_x +_atom_site_fract_y +_atom_site_fract_z +_atom_site_charge +C C 0.03198 0.90054 0.46577 0.1870000000 +C C 0.08103 0.46326 0.91497 -0.1740000000 +C C 0.13137 0.56584 0.09913 0.1870000000 +C C 0.16623 0.08519 0.54897 -0.1750000000 +C C 0.17945 0.43074 0.82994 0.3860000000 +C C 0.19214 0.98722 0.51494 -0.0710000000 +C C 0.20483 0.52779 0.01259 -0.0730000000 +C C 0.25129 0.60109 0.43035 0.3860000000 +C C 0.32290 0.51504 0.52768 -0.0720000000 +C C 0.34978 0.17032 0.60199 0.3860000000 +C C 0.38223 0.54845 0.46308 -0.1750000000 +C C 0.43442 0.46640 0.56592 0.1880000000 +C C 0.56558 0.53360 0.43408 0.1880000000 +C C 0.61777 0.45155 0.53692 -0.1750000000 +C C 0.65022 0.82968 0.39801 0.3860000000 +C C 0.67710 0.48496 0.47232 -0.0720000000 +C C 0.74871 0.39891 0.56965 0.3860000000 +C C 0.79517 0.47221 0.98741 -0.0730000000 +C C 0.80786 0.01278 0.48506 -0.0710000000 +C C 0.82055 0.56926 0.17006 0.3860000000 +C C 0.83377 0.91481 0.45103 -0.1750000000 +C C 0.86863 0.43416 0.90087 0.1870000000 +C C 0.91897 0.53674 0.08503 -0.1730000000 +C C 0.96802 0.09946 0.53423 0.1880000000 +H H 0.18345 0.52756 0.54920 0.0590000000 +H H 0.34407 0.97808 0.52739 0.0590000000 +H H 0.36584 0.54940 0.02155 0.0590000000 +H H 0.63416 0.45060 0.97845 0.0590000000 +H H 0.65593 0.02192 0.47261 0.0590000000 +H H 0.81655 0.47244 0.45080 0.0590000000 +O O 0.06914 0.36860 0.73969 -0.5690000000 +O O 0.07325 0.80916 0.43511 -0.5130000000 +O O 0.09643 0.61781 0.46347 -0.4540000000 +O O 0.26402 0.62706 0.19021 -0.5130000000 +O O 0.29956 0.62958 0.36820 -0.5690000000 +O O 0.32998 0.26088 0.63057 -0.5690000000 +O O 0.36301 0.43627 0.62724 -0.5130000000 +O O 0.36745 0.46401 0.84623 -0.4540000000 +O O 0.47868 0.84617 0.38113 -0.4540000000 +O O 0.52132 0.15383 0.61887 -0.4540000000 +O O 0.63255 0.53599 0.15377 -0.4540000000 +O O 0.63699 0.56373 0.37276 -0.5130000000 +O O 0.67002 0.73912 0.36943 -0.5690000000 +O O 0.70044 0.37042 0.63180 -0.5690000000 +O O 0.73598 0.37294 0.80979 -0.5130000000 +O O 0.90357 0.38219 0.53653 -0.4540000000 +O O 0.92675 0.19084 0.56489 -0.5130000000 +O O 0.93086 0.63140 0.26031 -0.5690000000 +Fe Fe 0.10807 0.32430 0.59200 1.1500000000 +Fe Fe 0.21621 0.73303 0.32332 1.1500000000 +Fe Fe 0.48320 0.59129 0.26667 1.1500000000 +Fe Fe 0.51680 0.40871 0.73333 1.1500000000 +Fe Fe 0.78379 0.26697 0.67668 1.1500000000 +Fe Fe 0.89193 0.67570 0.40800 1.1500000000 diff --git a/tests/test_matmineradapter.py b/tests/test_matmineradapter.py new file mode 100644 index 00000000..c6ae1bfe --- /dev/null +++ b/tests/test_matmineradapter.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +"""Test the MatminerAdapter.""" +import numpy as np +from matminer.featurizers.site import SOAP +from matminer.featurizers.structure import DensityFeatures, SiteStatsFingerprint +from pymatgen.core.structure import Structure + +from mofdscribe.featurizers.matmineradapter import MatminerAdapter +from mofdscribe.mof import MOF + + +def test_matminer_adapter(hkust_structure): + matminer_featurizer = DensityFeatures() + adapter = MatminerAdapter(matminer_featurizer) + features = adapter.featurize(MOF(hkust_structure)) + original_features = matminer_featurizer.featurize(hkust_structure) + + assert np.allclose(features, original_features) + + assert adapter.feature_labels() == matminer_featurizer.feature_labels() + assert adapter.citations() == matminer_featurizer.citations() + assert adapter.implementors() == matminer_featurizer.implementors() + + +def test_matminer_adapter_with_fit(hkust_structure): + original_featurizer = SiteStatsFingerprint(SOAP(6, 8, 8, 0.4, False, "gto", False)) + hkust_structure = Structure.from_sites(hkust_structure.sites) + original_featurizer.fit([hkust_structure]) + original_features = original_featurizer.featurize(hkust_structure) + + matminer_featurizer = SiteStatsFingerprint(SOAP(6, 8, 8, 0.4, False, "gto", False)) + adapter = MatminerAdapter(matminer_featurizer) + adapter.fit([MOF(hkust_structure)]) + features = adapter.featurize(MOF(hkust_structure)) + + assert np.allclose(features, original_features) diff --git a/tests/test_mof.py b/tests/test_mof.py new file mode 100644 index 00000000..fb12e1a9 --- /dev/null +++ b/tests/test_mof.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +"""test the MOF base class""" +from pymatgen.analysis.graphs import StructureGraph + +from mofdscribe.mof import MOF + + +def test_mof(hkust_structure): + mof = MOF(hkust_structure) + assert isinstance(mof.structure_graph, StructureGraph) + fragments = mof.fragments + assert hasattr(fragments, "linkers") + assert isinstance(mof.decorated_graph_hash, str) + assert isinstance(mof.decorated_scaffold_hash, str) + assert isinstance(mof.undecorated_graph_hash, str) + assert isinstance(mof.undecorated_scaffold_hash, str) + + +def test_mof_from_file(hkust_path): + mof = MOF.from_file(hkust_path) + sg_non_primitive = mof.structure_graph + assert isinstance(sg_non_primitive, StructureGraph) + fragments = mof.fragments + assert hasattr(fragments, "linkers") + assert isinstance(mof.decorated_graph_hash, str) + assert isinstance(mof.decorated_scaffold_hash, str) + assert isinstance(mof.undecorated_graph_hash, str) + assert isinstance(mof.undecorated_scaffold_hash, str) diff --git a/tests/utils/test_structure_graph.py b/tests/utils/test_structure_graph.py index 6a8233af..8ea4a227 100644 --- a/tests/utils/test_structure_graph.py +++ b/tests/utils/test_structure_graph.py @@ -1,11 +1,10 @@ # -*- coding: utf-8 -*- """Test helper functions for dealing with structure graphs.""" -from mofdscribe.featurizers.utils.structure_graph import get_neighbors_at_distance +from mofdscribe.featurizers.utils.structure_graph import get_neighbors_up_to_scope -def test_get_neighbors_at_distance(hkust_structure_graph): - """Test the get_neighbors_at_distance function - at some case studies (manually verified).""" - assert len(get_neighbors_at_distance(hkust_structure_graph, 1, 1)[1]) == 4 - - assert len(get_neighbors_at_distance(hkust_structure_graph, 1, 2)[1]) == 4 +def test_get_neighbors_up_to_scope(hkust_structure_graph): + res = get_neighbors_up_to_scope(hkust_structure_graph, 1, 3) + assert len(res[1]) == 4 + assert len(res[2]) == 4 + assert len(res[3]) == 8 diff --git a/tox.ini b/tox.ini index 2616e35e..66c0a385 100644 --- a/tox.ini +++ b/tox.ini @@ -11,7 +11,7 @@ envlist = lint #manifest pyroma - flake8 < 5 + flake8 #mypy # documentation linters/checkers #doc8 @@ -56,17 +56,17 @@ description = Run linters. skip_install = true deps = darglint - flake8 < 5 + flake8 flake8-black flake8-bugbear flake8-colors flake8-docstrings - flake8-isort < 5 + flake8-isort flake8-print pep8-naming pydocstyle commands = - flake8 src/mofdscribe/ tests/ setup.py + flake8 --color always src/mofdscribe/ tests/ setup.py description = Run the flake8 tool with several plugins (docstrings, import order, pep8 naming). See https://cthoyt.com/2020/04/25/how-to-code-with-me-flake8.html for more information. [testenv:pyroma]