diff --git a/README.md b/README.md index 7f6d89c..4987ea7 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,9 @@ Note: This package is under active development as of 2026, if you encounter any ## Table of Contents 1. **Installation** -2. **Basic Usage** +2. **Configuration Options (kwargs)** +3. **Basic Usage** +4. **PDB-Only Mode (No Multiwfn Required)** ## 1. Installation @@ -42,7 +44,49 @@ export PATH=/path/to/Multiwfn_3.8_bin_Linux_noGUI/Multiwfn_noGUI:$PATH Or use the full path when running PyEF (e.g., `multiwfn_path: /path/to/Multiwfn_3.8_bin_Linux_noGUI/Multiwfn_noGUI`). -## 2. Basic Usage +## 2. Configuration Options (kwargs) + +When creating an `Electrostatics` object, you can pass keyword arguments to customize the calculation. All are optional. + +### ECP (Effective Core Potential) Support + +If your QM calculation used ECPs, set `hasECP=True` and specify the ECP family: + +```python +es = Electrostatics(molden_paths, xyz_paths, hasECP=True, ECP="lacvps") +``` + +**Supported `ECP` values:** + +| ECP Value | Description | +|-----------|-------------| +| `"lacvps"` | (default) Hybrid: all-electron for Z ≤ 18 (up to Ar), LANL2DZ for heavier elements | +| `"lacvp"` | Hybrid: all-electron for Z ≤ 10 (up to Ne), LANL2DZ for heavier elements | +| `"lanl2dz"` | LANL2DZ — widely used ECP for transition metals | +| `"def2"` | def2-type ECPs (e.g., def2-SVP, def2-TZVP families) | +| `"crenbl"` | CRENBL shape-consistent relativistic ECPs | +| `"stuttgart_rsc"` | Stuttgart Relativistic Small Core ECPs | + +### All Configuration kwargs + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `hasECP` | bool | `False` | Whether the QM calculation used ECPs | +| `ECP` | str | `"lacvps"` | ECP basis set family (see above). Only used when `hasECP=True` | +| `dielectric` | float | `1` | Dielectric constant (1=vacuum, 4≈protein, 78.5≈water) | +| `dielectric_scale` | float | `1` | Scaling factor for point charges in dielectric treatment | +| `changeDielectBoundBool` | bool | `False` | Use dielectric=1 for bonded atoms | +| `includePtChgs` | bool | `False` | Include QM/MM point charges in calculations | +| `rerun` | bool | `False` | Force recalculation of charges (ignore cache) | +| `maxIHirshBasis` | int | `12000` | Max basis functions for Hirshfeld-I analysis | +| `maxIHirshFuzzyBasis` | int | `6000` | Max basis functions for fuzzy Hirshfeld-I analysis | +| `excludeAtomfromEcalc` | list | `[]` | Atom indices to exclude from E-field calculations | +| `skip_missing_files` | bool | `False` | Skip jobs with missing files instead of raising errors | +| `visualize_ef` | bool | `False` | Create PDB files with atom-wise E-field data | +| `visualize_charges` | bool | `False` | Create PDB files with partial charges | +| `visualize_per_bond` | bool | `False` | Create separate PDB files per bond | + +## 3. Basic Usage ### Electric Field Calculation @@ -204,4 +248,41 @@ spectrum b, blue_white_red, minimum=-1, maximum=1 --- +## 4. PDB-Only Mode (No Multiwfn Required) + +Skip the QM workflow entirely by placing your partial charges in the **B-factor column** of a PDB file. No `.molden`, `.xyz`, or Multiwfn needed. Monopole charges only. + +**Electric Field:** +```python +from pyef.analysis import Electrostatics +import numpy as np + +es = Electrostatics(pdb_charge_paths=['job1/charges.pdb', 'job2/charges.pdb'], dielectric=1.0) + +sub_indices = [np.arange(0, 20), np.arange(0, 20)] +es.setExcludeAtomFromCalc(sub_indices) + +df = es.getEfield(input_bond_indices=[[(25, 26)], [(25, 26)]], Efielddata_filename='ef_pdb') +``` + +**ESP:** +```python +es = Electrostatics(pdb_charge_paths=['job1/charges.pdb', 'job2/charges.pdb'], + esp_atom_idx=[30, 30], dielectric=4.0) + +df = es.getESP(ESPdata_filename='esp_pdb') +``` + +**Electrostatic Stabilization:** +```python +es = Electrostatics(pdb_charge_paths=['job1/charges.pdb', 'job2/charges.pdb'], dielectric=1.0) + +df = es.getElectrostatic_stabilization(substrate_idxs=[np.arange(0, 10), np.arange(0, 10)], + name_dataStorage='estab_pdb') +``` + +`pdb_charge_paths` can also be passed directly to any of the three calculation functions on a standard (molden/xyz) object to override charges for that call only. + +--- + **Authors:** Melissa Manetsch and David W. Kastner diff --git a/docs/api.rst b/docs/api.rst index 35e7604..3001103 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -50,7 +50,8 @@ Initialization molden_paths, # List of .molden file paths xyz_paths, # List of .xyz file paths esp_atom_idx=None, # Atom indices for ESP (0-indexed) - ptchg_paths=None # Point charge files for QM/MM + ptchg_paths=None, # Point charge files for QM/MM + **kwargs # Configuration options (see below) ) **Required parameters:** @@ -63,6 +64,193 @@ Initialization - ``esp_atom_idx`` (list of int): Atom indices for ESP calculations (0-indexed) - ``ptchg_paths`` (list): Point charge file paths for QM/MM calculations +.. _configuration-kwargs: + +Configuration kwargs +~~~~~~~~~~~~~~~~~~~~ + +The following keyword arguments can be passed to the ``Electrostatics`` constructor to +configure the calculation behavior. All are optional and have sensible defaults. + +**ECP (Effective Core Potential) Options:** + +.. list-table:: + :widths: 15 10 15 60 + :header-rows: 1 + + * - Parameter + - Type + - Default + - Description + * - ``hasECP`` + - bool + - ``False`` + - Set to ``True`` if your QM calculation used Effective Core Potentials. When enabled, pyEF will + automatically reformat molden files to fix ECP-related artifacts that make them incompatible with Multiwfn. + * - ``ECP`` + - str + - ``"lacvps"`` + - The ECP basis set family used in the QM calculation. Only relevant when ``hasECP=True``. + See :ref:`supported ECPs ` below. + +.. _supported-ecps: + +**Supported ECP basis set families:** + +.. list-table:: + :widths: 20 80 + :header-rows: 1 + + * - ECP Value + - Description + * - ``"lacvps"`` + - LACVPS (default). Hybrid ECP: all-electron for elements with Z |le| 18 (up to Ar), + LANL2DZ ECP for heavier elements. Common for TeraChem calculations with transition metals. + * - ``"lacvp"`` + - LACVP. Hybrid ECP: all-electron for elements with Z |le| 10 (up to Ne), + LANL2DZ ECP for heavier elements. + * - ``"lanl2dz"`` + - LANL2DZ (Los Alamos National Laboratory 2-Double-Zeta). Widely used ECP for transition metals. + * - ``"def2"`` + - def2-type ECPs (e.g., def2-SVP, def2-TZVP). Stuttgart/Cologne group ECPs used in the Ahlrichs basis set family. + * - ``"crenbl"`` + - CRENBL (Christiansen, Ross, Ermler, Nash, Bursten, Large-core). Shape-consistent relativistic ECPs. + * - ``"stuttgart_rsc"`` + - Stuttgart RSC (Relativistic Small Core). Energy-consistent ECPs from the Stuttgart/Cologne group. + +.. |le| unicode:: U+2264 + +**Example with ECP:** + +.. code-block:: python + + # For a TeraChem calculation that used LACVPS basis set + es = Electrostatics(molden_paths, xyz_paths, hasECP=True, ECP="lacvps") + + # For a calculation that used def2-type ECPs + es = Electrostatics(molden_paths, xyz_paths, hasECP=True, ECP="def2") + +**Dielectric and Environment Options:** + +.. list-table:: + :widths: 25 10 10 55 + :header-rows: 1 + + * - Parameter + - Type + - Default + - Description + * - ``dielectric`` + - float + - ``1`` + - Dielectric constant for the environment. Common values: 1.0 (vacuum), 2-4 (protein interior), + 20-40 (protein-solvent interface), 78.5 (water). Can also be overridden per method call. + * - ``dielectric_scale`` + - float + - ``1`` + - Scaling factor applied to point charges in the dielectric treatment. Can also be overridden per method call. + * - ``changeDielectBoundBool`` + - bool + - ``False`` + - When ``True``, uses dielectric=1 for atoms that are directly bonded, applying the dielectric + scaling only to non-bonded interactions. Useful for modeling distinct dielectric boundaries. + * - ``excludeAtomfromEcalc`` + - list + - ``[]`` + - List of atom indices to exclude from E-field calculations. Typically used to exclude substrate + atoms when probing only the environment's contribution. Can also be set via ``setExcludeAtomFromCalc()``. + +**QM/MM Point Charge Options:** + +.. list-table:: + :widths: 20 10 10 60 + :header-rows: 1 + + * - Parameter + - Type + - Default + - Description + * - ``includePtChgs`` + - bool + - ``False`` + - Include QMMM point charges in ESP/E-field calculations. Point charge files must be provided + via ``ptchg_paths`` or ``ptChgfp``. + * - ``ptChgfp`` + - str + - ``''`` + - Legacy API: a single point charge filename applied to all jobs. Prefer using ``ptchg_paths`` instead. + +**Computation Options:** + +.. list-table:: + :widths: 25 10 10 55 + :header-rows: 1 + + * - Parameter + - Type + - Default + - Description + * - ``rerun`` + - bool + - ``False`` + - Force recalculation of charges even if cached results exist. By default, pyEF caches Multiwfn + charge results and reuses them on subsequent runs. + * - ``maxIHirshBasis`` + - int + - ``12000`` + - Maximum number of basis functions allowed for Hirshfeld-I analysis. Increase for very large systems. + * - ``maxIHirshFuzzyBasis`` + - int + - ``6000`` + - Maximum number of basis functions allowed for fuzzy Hirshfeld-I analysis. + * - ``skip_missing_files`` + - bool + - ``False`` + - When ``True``, skip jobs where required input files are missing instead of raising an error. + Useful for batch processing where some calculations may have failed. + +**Visualization Options:** + +.. list-table:: + :widths: 20 10 10 60 + :header-rows: 1 + + * - Parameter + - Type + - Default + - Description + * - ``visualize_ef`` + - bool + - ``False`` + - Create PDB files with atom-wise electric field contributions in the B-factor column. + * - ``visualize_charges`` + - bool + - ``False`` + - Create PDB files with partial charges in the B-factor column. + * - ``visualize_per_bond`` + - bool + - ``False`` + - Create separate PDB files for each bond in E-field calculations. + +**Legacy/Compatibility Options:** + +.. list-table:: + :widths: 20 10 25 45 + :header-rows: 1 + + * - Parameter + - Type + - Default + - Description + * - ``molden_filename`` + - str + - ``'final_optim.molden'`` + - Molden filename for backward compatibility with older directory-based workflows. + * - ``xyzfilename`` + - str + - ``'final_optim.xyz'`` + - XYZ filename for backward compatibility with older directory-based workflows. + .. _getefield-electric-field: getEfield() - Electric Field @@ -71,24 +259,39 @@ getEfield() - Electric Field .. code-block:: python df = es.getEfield( - charge_types, # 'Hirshfeld_I', 'CHELPG', etc. - Efielddata_filename, # Output filename prefix - multiwfn_path, # Path to Multiwfn executable - input_bond_indices=[], # [(atom1, atom2), ...] - multipole_bool=False, # Use multipole expansion - dielectric=1 # Dielectric constant + charge_types='Hirshfeld_I', # Charge scheme(s) + Efielddata_filename='ef', # Output filename prefix + multiwfn_path=None, # Path to Multiwfn executable + input_bond_indices=[], # [(atom1, atom2), ...] per structure + auto_find_bonds=False, # Auto-detect bonds to adjacent atoms + multipole_bool=False, # Use multipole expansion + save_atomwise_decomposition=False, # Save per-atom contributions + visualize=None, # Create PDB visualization files + dielectric=1, # Dielectric constant + dielectric_scale=1, # Dielectric scaling for point charges + includePtChgs=None, # Override point charge setting + ptchg_paths=None, # Override point charge file paths + pdb_charge_paths=None # Use charges from PDB B-factor column ) **Parameters:** -- ``charge_types`` (str or list): Charge scheme(s) to use -- ``Efielddata_filename`` (str): Output CSV filename prefix +- ``charge_types`` (str or list): Charge scheme(s) to use (first entry used if list) +- ``Efielddata_filename`` (str): Output CSV filename prefix (default: ``'ef'``) - ``multiwfn_path`` (str): Path to Multiwfn executable -- ``input_bond_indices`` (list of tuples): Bond pairs as (atom1, atom2) -- ``multipole_bool`` (bool): True for multipole expansion, False for monopole -- ``dielectric`` (float): Dielectric constant - -**Returns:** DataFrame with electric field results +- ``input_bond_indices`` (list of tuples): Bond pairs as ``(atom1, atom2)`` per structure +- ``auto_find_bonds`` (bool): Automatically find bonds to adjacent atoms (default: ``False``) +- ``multipole_bool`` (bool): ``True`` for multipole expansion, ``False`` for monopole (default: ``False``) +- ``save_atomwise_decomposition`` (bool): Save per-atom E-field breakdown (default: ``False``) +- ``visualize`` (bool or None): Create PDB files with E-field data in B-factor column +- ``dielectric`` (float): Dielectric constant (default: ``1``) +- ``dielectric_scale`` (float): Scaling factor for point charges (default: ``1``) +- ``includePtChgs`` (bool or None): Override the class-level point charge inclusion setting +- ``ptchg_paths`` (list or None): Override point charge file paths +- ``pdb_charge_paths`` (str or list): Read charges from PDB B-factor column instead of computing them (monopole only) + +**Returns:** DataFrame with electric field results. If ``save_atomwise_decomposition=True``, +returns a tuple ``(total_df, atomwise_df)``. .. _getesp-electrostatic-potential: @@ -98,20 +301,32 @@ getESP() - Electrostatic Potential .. code-block:: python df = es.getESP( - charge_types, # 'Hirshfeld_I', 'CHELPG', etc. - ESPdata_filename, # Output filename prefix - multiwfn_path, # Path to Multiwfn executable - use_multipole=False, # Use multipole expansion - dielectric=1 # Dielectric constant + charge_types='CM5', # Charge scheme(s) + ESPdata_filename='ESP', # Output filename prefix + multiwfn_path='None', # Path to Multiwfn executable + use_multipole=False, # Use multipole expansion + include_decay=False, # Include distance-sorted ESP decay + include_coord_shells=False, # Include coordination shell ESP + visualize=None, # Create PDB visualization files + dielectric=1, # Dielectric constant + dielectric_scale=1, # Dielectric scaling for point charges + includePtChgs=None, # Override point charge setting + ptchg_paths=None # Override point charge file paths ) **Parameters:** - ``charge_types`` (str or list): Charge scheme(s) to use -- ``ESPdata_filename`` (str): Output CSV filename prefix +- ``ESPdata_filename`` (str): Output CSV filename prefix (default: ``'ESP'``) - ``multiwfn_path`` (str): Path to Multiwfn executable -- ``use_multipole`` (bool): True for multipole expansion -- ``dielectric`` (float): Dielectric constant +- ``use_multipole`` (bool): Use multipole expansion instead of monopole (default: ``False``) +- ``include_decay`` (bool): Include distance-sorted ESP decay analysis (default: ``False``) +- ``include_coord_shells`` (bool): Include coordination shell ESP breakdown (default: ``False``) +- ``visualize`` (bool or None): Create PDB files with ESP in B-factor column +- ``dielectric`` (float): Dielectric constant (default: ``1``) +- ``dielectric_scale`` (float): Scaling factor for point charges (default: ``1``) +- ``includePtChgs`` (bool or None): Override the class-level point charge inclusion setting +- ``ptchg_paths`` (list or None): Override point charge file paths **Returns:** DataFrame with ESP results @@ -125,20 +340,22 @@ getCharges() - Partial Charges .. code-block:: python df = es.getCharges( - charge_types='Hirshfeld_I', # Charge scheme(s) - multiwfn_path='/path/to/multiwfn', - output_filename='charges', # Output filename prefix - write_pdb=False # Write PDB files with charges + charge_types='Hirshfeld_I', # Charge scheme(s) + multiwfn_path=None, # Path to Multiwfn executable + output_filename='charges', # Output filename prefix + write_pdb=False, # Write PDB files with charges + pdb_bfactor=True # Store charges in B-factor column ) **Parameters:** - ``charge_types`` (str or list): Charge scheme(s) to use - ``multiwfn_path`` (str): Path to Multiwfn executable -- ``output_filename`` (str): Output CSV filename prefix -- ``write_pdb`` (bool): If True, write PDB files with charges in B-factor column +- ``output_filename`` (str): Output CSV filename prefix (default: ``'charges'``) +- ``write_pdb`` (bool): If ``True``, write PDB files with charges in B-factor column (default: ``False``) +- ``pdb_bfactor`` (bool): Store charges in B-factor column of PDB (default: ``True``) -**Returns:** DataFrame with columns: Job, Charge_Type, Atom_Index, Element, x, y, z, Charge, Molden_Path, XYZ_Path +**Returns:** DataFrame with columns: ``Job``, ``Charge_Type``, ``Atom_Index``, ``Element``, ``x``, ``y``, ``z``, ``Charge``, ``Molden_Path``, ``XYZ_Path`` .. _getelectrostatic-stabilization-electrostatic-stabilization: @@ -148,22 +365,46 @@ getElectrostatic_stabilization() - Electrostatic Stabilization .. code-block:: python df = es.getElectrostatic_stabilization( - multiwfn_path='/path/to/multiwfn', - substrate_idxs=[], # Substrate atom indices per structure - charge_type='Hirshfeld_I', # Charge scheme - multipole_order=2, # 1 for monopole, 2 for dipole - dielectric=1 # Dielectric constant + multiwfn_path=None, # Path to Multiwfn executable + substrate_idxs=[], # Substrate atom indices per structure + charge_type='Hirshfeld_I', # Charge scheme + name_dataStorage='estatic', # Output filename prefix + env_idxs=None, # Environment atom indices (default: all non-substrate) + save_atomwise_decomposition=False, # Save per-atom contributions + visualize=None, # Create PDB visualization files + multipole_order=2, # 1=monopole, 2=dipole, 3=quadrupole + substrate_multipole_order=None, # Override multipole order for substrate + env_multipole_order=None, # Override multipole order for environment + dielectric=1, # Dielectric constant + dielectric_scale=1, # Dielectric scaling for point charges + includePtChgs=None, # Override point charge setting + ptchg_paths=None # Override point charge file paths ) **Parameters:** - ``multiwfn_path`` (str): Path to Multiwfn executable -- ``substrate_idxs`` (list): List of atom indices for substrate per structure -- ``charge_type`` (str): Charge scheme (Hirshfeld, Hirshfeld_I, or Becke for multipole_order=2) -- ``multipole_order`` (int): 1 for monopole, 2 for dipole corrections -- ``dielectric`` (float): Dielectric constant - -**Returns:** DataFrame with electrostatic stabilization energies +- ``substrate_idxs`` (list): List of atom indices for the substrate, one list per structure +- ``charge_type`` (str): Charge scheme (default: ``'Hirshfeld_I'``). Must be Hirshfeld, Hirshfeld_I, or Becke for ``multipole_order`` >= 2 +- ``name_dataStorage`` (str): Output filename prefix (default: ``'estatic'``) +- ``env_idxs`` (list or None): Environment atom indices. If ``None``, uses all non-substrate atoms +- ``save_atomwise_decomposition`` (bool): Save per-atom energy breakdown (default: ``False``) +- ``visualize`` (bool or None): Create PDB files with stabilization data +- ``multipole_order`` (int): Multipole expansion order (default: ``2``): + + - ``1``: Monopole only (charge-charge interactions) + - ``2``: Monopole + dipole (charge-charge, charge-dipole, dipole-dipole) + - ``3``: Monopole + dipole + quadrupole (all terms up to quadrupole-quadrupole) + +- ``substrate_multipole_order`` (int or None): Override ``multipole_order`` for the substrate only +- ``env_multipole_order`` (int or None): Override ``multipole_order`` for the environment only +- ``dielectric`` (float): Dielectric constant (default: ``1``) +- ``dielectric_scale`` (float): Scaling factor for point charges (default: ``1``) +- ``includePtChgs`` (bool or None): Include point charges as part of the environment +- ``ptchg_paths`` (list or None): Override point charge file paths + +**Returns:** DataFrame with electrostatic stabilization energies (kcal/mol). +If ``save_atomwise_decomposition=True``, returns a tuple ``(total_df, atomwise_df)``. Charge Schemes -------------- diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 1589fac..934c9ec 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -1,8 +1,9 @@ Getting Started =============== -**Jump to:** `Installation`_ · `getCharges()`_ · `getEfield()`_ · `getESP()`_ · `getElectrostatic_stabilization()`_ +**Jump to:** `Installation`_ · `Configuration kwargs`_ · `getCharges()`_ · `getEfield()`_ · `getESP()`_ · `getElectrostatic_stabilization()`_ +.. _Configuration kwargs: `Initialization Configuration (kwargs)`_ .. _getCharges(): `Computing Partial Charges`_ .. _getEfield(): `Computing Electric Fields`_ .. _getESP(): `Computing Electrostatic Potentials`_ @@ -99,6 +100,82 @@ Supported Charge Schemes - ⚠️ **Limitation**: Missing parameters for some elements including Na, K, and most transition metals. Will fail if your system contains unsupported elements +.. _initialization-kwargs: + +Initialization Configuration (kwargs) +-------------------------------------- + +When creating an ``Electrostatics`` object, you can pass keyword arguments to configure the +calculation behavior. All kwargs are optional. + +**ECP (Effective Core Potential) support:** + +If your QM calculation used ECPs, you must tell pyEF so it can fix molden file artifacts: + +.. code-block:: python + + # For a TeraChem calculation with LACVPS (default ECP) + es = Electrostatics(molden_paths, xyz_paths, hasECP=True, ECP="lacvps") + + # For a calculation with Stuttgart RSC ECPs + es = Electrostatics(molden_paths, xyz_paths, hasECP=True, ECP="stuttgart_rsc") + +Supported ``ECP`` values: + +- ``"lacvps"`` (default) -- Hybrid: all-electron for Z |le| 18, LANL2DZ for heavier elements +- ``"lacvp"`` -- Hybrid: all-electron for Z |le| 10, LANL2DZ for heavier elements +- ``"lanl2dz"`` -- LANL2DZ ECP, widely used for transition metals +- ``"def2"`` -- def2-type ECPs (e.g., def2-SVP, def2-TZVP families) +- ``"crenbl"`` -- CRENBL shape-consistent relativistic ECPs +- ``"stuttgart_rsc"`` -- Stuttgart Relativistic Small Core ECPs + +.. |le| unicode:: U+2264 + +**Dielectric environment:** + +.. code-block:: python + + # Protein interior (dielectric ~ 4) + es = Electrostatics(molden_paths, xyz_paths, dielectric=4.0) + + # Water solvent (dielectric ~ 78.5) + es = Electrostatics(molden_paths, xyz_paths, dielectric=78.5) + + # Use dielectric=1 for bonded atoms (special boundary treatment) + es = Electrostatics(molden_paths, xyz_paths, dielectric=4.0, changeDielectBoundBool=True) + +**QM/MM point charges:** + +.. code-block:: python + + # New API: per-job point charge paths + es = Electrostatics(molden_paths, xyz_paths, + ptchg_paths=['/path/to/ptchg1.txt', '/path/to/ptchg2.txt'], + includePtChgs=True) + +**Computation options:** + +.. code-block:: python + + # Force recalculation (ignore cached charges) + es = Electrostatics(molden_paths, xyz_paths, rerun=True) + + # Skip missing files in batch processing + es = Electrostatics(molden_paths, xyz_paths, skip_missing_files=True) + + # Increase basis function limit for very large systems + es = Electrostatics(molden_paths, xyz_paths, maxIHirshBasis=20000) + +**Visualization:** + +.. code-block:: python + + # Enable PDB output for E-fields and charges + es = Electrostatics(molden_paths, xyz_paths, + visualize_ef=True, visualize_charges=True) + +For the complete list of all kwargs and their defaults, see the :ref:`API Reference `. + Computing Partial Charges ------------------------- diff --git a/pyef/analysis.py b/pyef/analysis.py index 3039b69..6d1b78d 100644 --- a/pyef/analysis.py +++ b/pyef/analysis.py @@ -110,18 +110,20 @@ class Electrostatics: >>> data = es_with_metals.getESP(['Hirshfeld', 'Hirshfeld_I'], 'esp_results', ...) """ - def __init__(self, molden_paths, xyz_paths, esp_atom_idx=None, - ptchg_paths=None, **kwargs): + def __init__(self, molden_paths=None, xyz_paths=None, esp_atom_idx=None, + ptchg_paths=None, pdb_charge_paths=None, **kwargs): """Initialize Electrostatics analysis object. Parameters ---------- - molden_paths : list of str + molden_paths : list of str or None List of .molden file paths for each job (absolute or relative). Example: ['/path/to/job1/optim.molden', '/path/to/job2/optim.molden'] - xyz_paths : list of str + Not required when pdb_charge_paths is provided (pdb_only mode). + xyz_paths : list of str or None List of .xyz file paths for each job (absolute or relative). Example: ['/path/to/job1/optim.xyz', '/path/to/job2/optim.xyz'] + Not required when pdb_charge_paths is provided (pdb_only mode). esp_atom_idx : list of int, optional List of atom indices for ESP calculation (0-indexed). Only required when running ESP analysis. Not needed for E-field @@ -131,6 +133,12 @@ def __init__(self, molden_paths, xyz_paths, esp_atom_idx=None, Use None for jobs without point charges. Example: ['/path/to/job1/ptchg.txt', None, '/path/to/job3/ptchg.txt'] If not provided, can use includePtChgs() or ptChgfp config for compatibility. + pdb_charge_paths : list of str or None, optional + List of PDB file paths whose B-factor column contains partial charges, + one per job. When provided, molden_paths and xyz_paths are not required + and Multiwfn is never called (pdb_only mode). Coordinates and charges + are read entirely from these PDB files. + Example: ['/path/to/job1/charges.pdb', '/path/to/job2/charges.pdb'] **kwargs : dict, optional Configuration options to override defaults. See class docstring for available configuration keys. @@ -138,6 +146,10 @@ def __init__(self, molden_paths, xyz_paths, esp_atom_idx=None, Notes ----- ECP options: "stuttgart_rsc", "def2", "crenbl", "lanl2dz", "lacvps" + + pdb_only mode: Pass pdb_charge_paths and omit (or pass None for) molden_paths + and xyz_paths. All three calculation functions (getESP, getEfield, + getElectrostatic_stabilization) will use the PDB charges directly. """ # Backward compatibility: accept old keyword lst_of_tmcm_idx if 'lst_of_tmcm_idx' in kwargs: @@ -153,10 +165,14 @@ def __init__(self, molden_paths, xyz_paths, esp_atom_idx=None, ) esp_atom_idx = kwargs.pop('lst_of_tmcm_idx') - # Validate inputs - # Type check: molden_paths and xyz_paths must be lists - if not isinstance(molden_paths, list): - raise TypeError(f""" + # Detect pdb_only mode: pdb_charge_paths provided without molden/xyz + _pdb_only = pdb_charge_paths is not None and molden_paths is None and xyz_paths is None + + # Validate inputs (skip for pdb_only mode) + if not _pdb_only: + # Type check: molden_paths and xyz_paths must be lists + if not isinstance(molden_paths, list): + raise TypeError(f""" {'='*60} ERROR: Invalid type for molden_paths {'='*60} @@ -169,14 +185,20 @@ def __init__(self, molden_paths, xyz_paths, esp_atom_idx=None, xyz_paths=['job1.xyz', 'job2.xyz'] ) +For pdb_only mode (no Multiwfn required): + estat = Electrostatics( + pdb_charge_paths=['job1.pdb', 'job2.pdb'], + esp_atom_idx=[25, 30] + ) + For more examples, see: - /home/gridsan/mmanetsch/pyEF/pyef/ExampleUsage.py - README.md: Section 3 (Detailed Tutorials) {'='*60} """) - if not isinstance(xyz_paths, list): - raise TypeError(f""" + if not isinstance(xyz_paths, list): + raise TypeError(f""" {'='*60} ERROR: Invalid type for xyz_paths {'='*60} @@ -195,9 +217,9 @@ def __init__(self, molden_paths, xyz_paths, esp_atom_idx=None, {'='*60} """) - # Length check - if len(molden_paths) != len(xyz_paths): - raise ValueError(f""" + # Length check + if len(molden_paths) != len(xyz_paths): + raise ValueError(f""" {'='*60} ERROR: Mismatched list lengths {'='*60} @@ -364,8 +386,14 @@ def __init__(self, molden_paths, xyz_paths, esp_atom_idx=None, validate_charge_type(charge_type, context="Electrostatics initialization") # Store file paths directly - self.molden_paths = [os.path.abspath(p) for p in molden_paths] - self.xyz_paths = [os.path.abspath(p) for p in xyz_paths] + if _pdb_only: + # pdb_only mode: molden/xyz are not used; store None sentinels + n_jobs = len(pdb_charge_paths) + self.molden_paths = [None] * n_jobs + self.xyz_paths = [None] * n_jobs + else: + self.molden_paths = [os.path.abspath(p) for p in molden_paths] + self.xyz_paths = [os.path.abspath(p) for p in xyz_paths] self.esp_atom_idx = esp_atom_idx if esp_atom_idx is not None else [] # Handle point charge paths with backward compatibility @@ -433,8 +461,16 @@ def __init__(self, molden_paths, xyz_paths, esp_atom_idx=None, # No point charges specified self.ptchg_paths = None - # Create list of working directories (one per job) from the molden file paths - self.lst_of_folders = [os.path.dirname(p) for p in self.molden_paths] + # Store pdb_only mode flag and PDB paths + self.pdb_only = _pdb_only + if _pdb_only: + self.pdb_charge_paths = [os.path.abspath(p) for p in pdb_charge_paths] + # Working directories come from PDB file locations + self.lst_of_folders = [os.path.dirname(p) for p in self.pdb_charge_paths] + else: + self.pdb_charge_paths = None + # Create list of working directories (one per job) from the molden file paths + self.lst_of_folders = [os.path.dirname(p) for p in self.molden_paths] self.folder_to_file_path = '' # No longer needed, but kept for compatibility # Initialize Multiwfn interface with current config @@ -448,12 +484,14 @@ def __init__(self, molden_paths, xyz_paths, esp_atom_idx=None, self.periodic_table = constants.PERIODIC_TABLE self.amassdict = constants.ATOMIC_MASS_DICT - # Prepare data files (extract final frames, check file existence) - self.prepData() - - # Fix molden files if ECP was used in calculations - if self.config['hasECP']: - self.fix_allECPmolden() + # Prepare data files (validate file existence) + if self.pdb_only: + self.prepData_pdb() + else: + self.prepData() + # Fix molden files if ECP was used in calculations + if self.config['hasECP']: + self.fix_allECPmolden() # ========================================================================= # Validation Methods @@ -928,6 +966,43 @@ def prepData(self): print(f'\n > Validation complete: {len(valid_indices)} valid job(s) ready for processing') + def prepData_pdb(self): + """Validate PDB charge files for pdb_only mode. + + Checks that all PDB files provided during initialization exist. + Removes invalid entries from processing lists. + + Raises + ------ + FileNotFoundError + If no valid PDB charge files are found. + """ + print(' > Validating PDB charge file paths (pdb_only mode)') + + valid_indices = [] + for idx, pdb_path in enumerate(self.pdb_charge_paths): + if not os.path.exists(pdb_path): + print(f'ERROR: PDB charge file not found: {pdb_path}') + continue + print(f' ✓ Job {idx+1}: Found PDB charge file') + print(f' - PDB: {pdb_path}') + valid_indices.append(idx) + + if len(valid_indices) == 0: + raise FileNotFoundError("No valid PDB charge files found!") + + if len(valid_indices) < len(self.pdb_charge_paths): + print(f'\nWARNING: Only {len(valid_indices)}/{len(self.pdb_charge_paths)} jobs have valid PDB files.') + print(' Invalid jobs will be removed from processing.') + self.pdb_charge_paths = [self.pdb_charge_paths[i] for i in valid_indices] + self.molden_paths = [self.molden_paths[i] for i in valid_indices] + self.xyz_paths = [self.xyz_paths[i] for i in valid_indices] + self.lst_of_folders = [self.lst_of_folders[i] for i in valid_indices] + if self.esp_atom_idx: + self.esp_atom_idx = [self.esp_atom_idx[i] for i in valid_indices] + + print(f'\n > Validation complete: {len(valid_indices)} valid pdb_only job(s) ready for processing') + # ========================================================================= # Multipole and Charge Parsing Methods # ========================================================================= @@ -1194,6 +1269,51 @@ def monopole_esp(self, path_to_xyz, espatom_idx, charge_range, charge_file, ptch final_esp = constants.COULOMB_CONSTANT*total_esp*constants.ELEMENTARY_CHARGE #N*m^2/(C^2)*(C/m) = N*m/C = J/C = Volt return [final_esp, df['Atom'][idx_atom]] + def monopole_esp_df(self, df_charges, espatom_idx, charge_range): + """Calculate ESP at a specified atom using charges from a DataFrame. + + This is the pdb_only equivalent of monopole_esp(): it reads directly + from a DataFrame (e.g. produced by getPdbCharges) instead of a file. + The changeDielectBoundBool feature is not supported in this mode because + bond detection requires an XYZ file. + + Parameters + ---------- + df_charges : pd.DataFrame + DataFrame with columns ['Atom', 'x', 'y', 'z', 'charge']. + Typically returned by getPdbCharges(). + espatom_idx : int + Atom index (0-indexed) at which the ESP is calculated. + charge_range : list of int + Indices of atoms to include in the ESP sum. + + Returns + ------- + list + [ESP (float in Volts), atom_symbol (str)] + """ + dielectric = self.config['dielectric'] + df = df_charges.reset_index(drop=True) + atoms = list(df['Atom']) + charges = list(df['charge']) + xs = list(df['x']) + ys = list(df['y']) + zs = list(df['z']) + + idx_atom = espatom_idx + xo, yo, zo = xs[idx_atom], ys[idx_atom], zs[idx_atom] + total_esp = 0 + + for idx in charge_range: + if idx == idx_atom: + continue + r = (((xs[idx] - xo) * constants.ANGSTROM_TO_M)**2 + + ((ys[idx] - yo) * constants.ANGSTROM_TO_M)**2 + + ((zs[idx] - zo) * constants.ANGSTROM_TO_M)**2) ** 0.5 + total_esp += (1 / dielectric) * (charges[idx] / r) + + final_esp = constants.COULOMB_CONSTANT * total_esp * constants.ELEMENTARY_CHARGE + return [final_esp, atoms[idx_atom]] def calc_firstTermE(espatom_idx, charge_range, charge_file): ''' @@ -1651,7 +1771,13 @@ def calc_bond_dipoles(self, bond_indices, xyz_filepath, atom_multipole_file, boo bond_dipoles = [] # Get geometry information - df_geom = Geometry(xyz_filepath).getGeomInfo() + # In pdb_only mode xyz_filepath is None; use coordinates already in charge_df. + if xyz_filepath is None and charge_df is not None: + df_geom = charge_df[['x', 'y', 'z']].rename( + columns={'x': 'X', 'y': 'Y', 'z': 'Z'} + ).reset_index(drop=True) + else: + df_geom = Geometry(xyz_filepath).getGeomInfo() # Get charge information if charge_df is not None: @@ -2034,7 +2160,8 @@ def esp_bydistance(self, path_to_xyz, espatom_idx, charge_file): def getESP(self, charge_types='CM5', ESPdata_filename='ESP', multiwfn_path='None', use_multipole=False, include_decay=False, include_coord_shells=False, - visualize=None, dielectric=1, dielectric_scale=1, includePtChgs=None, ptchg_paths=None): + visualize=None, dielectric=1, dielectric_scale=1, includePtChgs=None, ptchg_paths=None, + pdb_charge_paths=None): """Unified ESP calculation method supporting multiple modes. This consolidated method replaces getESPData(), getESPMultipole(), and getESPDecay() @@ -2073,6 +2200,11 @@ def getESP(self, charge_types='CM5', ESPdata_filename='ESP', multiwfn_path='None Override point charge file paths for this calculation. List of paths (one per job), with None for jobs without point charges. If provided, takes precedence over object-level settings (default: None). + pdb_charge_paths : str or list of str or None, optional + If provided, use partial charges from PDB B-factor columns only (no Multiwfn). + Can be a single PDB path or a list of paths (one per job), with None to skip a job. + Overrides the object-level pdb_charge_paths if the object was created in pdb_only mode. + This mode supports monopole charges only (use_multipole must be False). Returns ------- @@ -2106,9 +2238,20 @@ def getESP(self, charge_types='CM5', ESPdata_filename='ESP', multiwfn_path='None >>> # Decay analysis mode (replaces getESPDecay) >>> es.getESP(['Hirshfeld'], 'esp_decay', ..., include_decay=True, include_coord_shells=True) + + >>> # pdb_only mode (no Multiwfn required) + >>> es.getESP(pdb_charge_paths=['job1.pdb', 'job2.pdb']) """ - # Warn if multiwfn_path is not provided - if multiwfn_path is None or multiwfn_path == 'None': + # Resolve pdb_charge_paths: function-level arg takes priority, else use object-level + if pdb_charge_paths is None and self.pdb_only: + pdb_charge_paths = self.pdb_charge_paths + use_pdb_charges = pdb_charge_paths is not None + + if use_pdb_charges and use_multipole: + raise ValueError("pdb_only mode supports monopole charges only. Set use_multipole=False.") + + # Warn if multiwfn_path is not provided (skip warning in pdb_only mode) + if not use_pdb_charges and (multiwfn_path is None or multiwfn_path == 'None'): warnings.warn( "multiwfn_path is not set. Multiwfn is required for charge partitioning. " "Consider using pyef.utility.get_openbabel_charges() as an alternative for " @@ -2186,10 +2329,29 @@ def getESP(self, charge_types='CM5', ESPdata_filename='ESP', multiwfn_path='None results_dict['Name'] = f # Use stored absolute path instead of constructing it file_path_xyz = self.xyz_paths[counter] + + # pdb_only: resolve and load PDB charges for this job + df_pdb_charges = None + if use_pdb_charges: + if isinstance(pdb_charge_paths, list): + pdb_charge_path = pdb_charge_paths[counter] if counter < len(pdb_charge_paths) else None + else: + pdb_charge_path = pdb_charge_paths + if pdb_charge_path is None: + print(f"Warning: No PDB charge file for job {counter}, skipping") + counter += 1 + continue + pdb_charge_path = os.path.abspath(pdb_charge_path) + if not os.path.exists(pdb_charge_path): + raise FileNotFoundError(f"PDB charge file not found: {pdb_charge_path}") + df_pdb_charges = self.getPdbCharges(pdb_charge_path) + n_atoms = len(df_pdb_charges) + all_lines = [x for x in range(n_atoms) if x not in self.config['excludeAtomfromEcalc']] + counter += 1 - # Calculate total lines for monopole calculations - if not use_multipole or include_decay: + # Calculate total lines for monopole calculations (standard mode) + if not use_pdb_charges and (not use_multipole or include_decay): total_lines = Electrostatics.mapcount(file_path_xyz) init_all_lines = range(0, total_lines - 2) all_lines = [x for x in init_all_lines if x not in self.config['excludeAtomfromEcalc']] @@ -2201,11 +2363,15 @@ def getESP(self, charge_types='CM5', ESPdata_filename='ESP', multiwfn_path='None file_path_multipole = f"{f}/Multipole{charge_type}.txt" file_path_monopole = f"{f}/Charges{charge_type}.txt" - # Check if required files exist when skip_missing_files is enabled - if self.config.get('skip_missing_files', False): - required_file = file_path_multipole if use_multipole else file_path_monopole - if not os.path.exists(required_file): - print(f""" + if use_pdb_charges: + # pdb_only: skip partitionCharge entirely + pass + else: + # Check if required files exist when skip_missing_files is enabled + if self.config.get('skip_missing_files', False): + required_file = file_path_multipole if use_multipole else file_path_monopole + if not os.path.exists(required_file): + print(f""" {'='*60} WARNING: Required charge file not found - skipping this calculation {'='*60} @@ -2216,25 +2382,25 @@ def getESP(self, charge_types='CM5', ESPdata_filename='ESP', multiwfn_path='None Skipping to next charge type (skip_missing_files=True) {'='*60} """) - continue + continue - # Use centralized partitionCharge() to get/compute charges - # Extract actual filenames from paths - molden_filename = os.path.basename(self.molden_paths[counter-1]) - xyz_filename = os.path.basename(self.xyz_paths[counter-1]) - comp_cost = self.multiwfn.partitionCharge( - multipole_bool=use_multipole, - f=f, - multiwfn_path=multiwfn_path, - charge_type=charge_type, - owd=owd, - molden_filename=molden_filename, - xyz_filename=xyz_filename - ) + # Use centralized partitionCharge() to get/compute charges + # Extract actual filenames from paths + molden_filename = os.path.basename(self.molden_paths[counter-1]) + xyz_filename = os.path.basename(self.xyz_paths[counter-1]) + comp_cost = self.multiwfn.partitionCharge( + multipole_bool=use_multipole, + f=f, + multiwfn_path=multiwfn_path, + charge_type=charge_type, + owd=owd, + molden_filename=molden_filename, + xyz_filename=xyz_filename + ) - if comp_cost == -1: - print(f"WARNING: Charge calculation failed for {charge_type} in {f}") - continue + if comp_cost == -1: + print(f"WARNING: Charge calculation failed for {charge_type} in {f}") + continue path_to_xyz = file_path_xyz # Get point charge path for this job @@ -2283,7 +2449,19 @@ def getESP(self, charge_types='CM5', ESPdata_filename='ESP', multiwfn_path='None """) # Calculate ESP using appropriate method - if use_multipole: + if use_pdb_charges: + # pdb_only: compute ESP directly from the PDB charge DataFrame + [ESP_all, atom_type] = self.monopole_esp_df( + df_pdb_charges, atom_idx, all_lines + ) + df_pdb_reset = df_pdb_charges.reset_index(drop=True) + total_charge = df_pdb_reset['charge'].sum() + partial_charge_atom = df_pdb_reset['charge'].iloc[atom_idx] + results_dict['Atoms'] = atom_type + results_dict['Total Charge'] = total_charge + results_dict[f'Partial Charge pdb'] = partial_charge_atom + results_dict[f'ESP pdb'] = ESP_all + elif use_multipole: [final_esp, atom_name] = self.multipole_esp( file_path_xyz, file_path_multipole, all_lines, atom_idx, ptchg_path=ptchg_path ) @@ -2477,6 +2655,9 @@ def getEfield(self, charge_types='Hirshfeld_I', Efielddata_filename='ef', multiw >>> # Monopole mode with atom-wise decomposition >>> es.getEfield('Hirshfeld', 'efield', ..., multipole_bool=False, save_atomwise_decomposition=True) """ + # Resolve pdb_charge_paths: function-level arg takes priority, else use object-level + if pdb_charge_paths is None and self.pdb_only: + pdb_charge_paths = self.pdb_charge_paths use_pdb_charges = pdb_charge_paths is not None if use_pdb_charges and multipole_bool: @@ -2536,8 +2717,15 @@ def getEfield(self, charge_types='Hirshfeld_I', Efielddata_filename='ef', multiw results_dict = {} # Use stored absolute path instead of constructing it file_path_xyz = self.xyz_paths[counter] - total_lines = Electrostatics.mapcount(file_path_xyz) - init_all_lines = range(0, total_lines - 2) + + # In pdb_only mode xyz_paths[counter] is None; defer total_lines/all_lines + # until after the PDB charge file has been loaded (see below). + if not use_pdb_charges: + total_lines = Electrostatics.mapcount(file_path_xyz) + init_all_lines = range(0, total_lines - 2) + else: + total_lines = None # set after loading PDB charges + init_all_lines = None # Handle excludeAtomfromEcalc - support both flat list and nested list exclude_atoms = self.config.get('excludeAtomfromEcalc', []) @@ -2547,7 +2735,11 @@ def getEfield(self, charge_types='Hirshfeld_I', Efielddata_filename='ef', multiw else: # Flat list/array: same exclusions for all folders exclude_list = list(exclude_atoms) if hasattr(exclude_atoms, '__iter__') else [] - all_lines = [x for x in init_all_lines if x not in exclude_list] + + if not use_pdb_charges: + all_lines = [x for x in init_all_lines if x not in exclude_list] + else: + all_lines = None # set after loading PDB charges # Determine bond indices if auto_find_bonds or (not input_bond_indices): @@ -2725,12 +2917,10 @@ def getEfield(self, charge_types='Hirshfeld_I', Efielddata_filename='ef', multiw """) df_pdb_charges = self.getPdbCharges(pdb_charge_path) - total_atoms = total_lines - 2 - if len(df_pdb_charges) != total_atoms: - raise ValueError( - f"PDB charge count mismatch for {pdb_charge_path}: " - f"{len(df_pdb_charges)} charges vs {total_atoms} atoms in {file_path_xyz}" - ) + # In pdb_only mode the atom count comes from the PDB itself + total_lines = len(df_pdb_charges) + 2 + init_all_lines = range(0, len(df_pdb_charges)) + all_lines = [x for x in init_all_lines if x not in exclude_list] # Calculate E-field with atom-wise decomposition if use_pdb_charges: @@ -4164,7 +4354,8 @@ def getElectrostatic_stabilization(self, multiwfn_path=None, save_atomwise_decomposition=False, visualize=None, multipole_order=2, substrate_multipole_order=None, env_multipole_order=None, dielectric=1, dielectric_scale=1, - includePtChgs=None, ptchg_paths=None): + includePtChgs=None, ptchg_paths=None, + pdb_charge_paths=None): """ Compute electrostatic stabilization using AMOEBA-style multipole tensor formalism. @@ -4227,6 +4418,11 @@ def getElectrostatic_stabilization(self, multiwfn_path=None, List of paths (one per job), with None for jobs without point charges. Point charges are treated as monopole-only environment atoms. If provided, takes precedence over object-level settings (default: None). + pdb_charge_paths : str or list of str or None, optional + If provided, use partial charges from PDB B-factor columns only (no Multiwfn). + Monopole-only (multipole_order is forced to 1). Coordinates are also read from + the PDB, so no separate xyz file is needed. + Overrides the object-level pdb_charge_paths when the object was created in pdb_only mode. Returns: -------- @@ -4303,6 +4499,17 @@ def getElectrostatic_stabilization(self, multiwfn_path=None, ... env_multipole_order=1 # MM: charges only ... ) """ + # Resolve pdb_charge_paths: function-level arg takes priority, else use object-level + if pdb_charge_paths is None and self.pdb_only: + pdb_charge_paths = self.pdb_charge_paths + use_pdb_charges = pdb_charge_paths is not None + + # In pdb_only mode, only monopole interactions are supported + if use_pdb_charges: + multipole_order = 1 + substrate_multipole_order = 1 + env_multipole_order = 1 + # Validate required parameters if substrate_idxs is None or (isinstance(substrate_idxs, list) and len(substrate_idxs) == 0): raise ValueError(f""" @@ -4327,8 +4534,9 @@ def getElectrostatic_stabilization(self, multiwfn_path=None, {'='*60} """) - # Validate charge type - validate_charge_type(charge_type, context="getElectrostatic_stabilization") + # Validate charge type (skip in pdb_only mode — no Multiwfn scheme needed) + if not use_pdb_charges: + validate_charge_type(charge_type, context="getElectrostatic_stabilization") # Validate multipole orders validate_numeric_range(multipole_order, "multipole_order", allowed_values=[1, 2, 3], @@ -4393,78 +4601,116 @@ def getElectrostatic_stabilization(self, multiwfn_path=None, results_dict = {} # Use stored absolute path instead of constructing it file_path_xyz = self.xyz_paths[counter] - total_lines = Electrostatics.mapcount(file_path_xyz) - init_all_lines = np.arange(0, total_lines - 2) + + if use_pdb_charges: + # pdb_only: load everything from the PDB file + if isinstance(pdb_charge_paths, list): + pdb_charge_path = pdb_charge_paths[counter] if counter < len(pdb_charge_paths) else None + else: + pdb_charge_path = pdb_charge_paths + if pdb_charge_path is None: + print(f"Warning: No PDB charge file for job {counter}, skipping") + counter += 1 + continue + pdb_charge_path = os.path.abspath(pdb_charge_path) + if not os.path.exists(pdb_charge_path): + raise FileNotFoundError(f"PDB charge file not found: {pdb_charge_path}") + df_pdb = self.getPdbCharges(pdb_charge_path) + n_atoms = len(df_pdb) + init_all_lines = np.arange(0, n_atoms) + + # Build df_all (monopole-only) from PDB DataFrame + df_all = df_pdb.rename(columns={'Atom': 'Element', 'charge': 'Atom_Charge'}).copy() + df_all["Index"] = range(0, n_atoms) + df_all['Dipole_Moment'] = [[0.0, 0.0, 0.0]] * n_atoms + df_all['Quadrupole_Moment'] = [np.zeros((3, 3))] * n_atoms + + # Build df_geom from PDB coordinates (same column names as Geometry.getGeomInfo()) + df_geom = df_pdb.rename(columns={'x': 'X', 'y': 'Y', 'z': 'Z'})[['Atom', 'X', 'Y', 'Z']].reset_index(drop=True) + df_geom.index = range(len(df_geom)) + + else: + total_lines = Electrostatics.mapcount(file_path_xyz) + init_all_lines = np.arange(0, total_lines - 2) if not env_idxs: env_idx = [x for x in init_all_lines if x not in substrate_idx] else: env_idx = env_idxs[counter] - # Partition charges (with multipole analysis if needed) - # Extract actual filenames from paths - molden_filename = os.path.basename(self.molden_paths[counter]) - xyz_filename = os.path.basename(self.xyz_paths[counter]) - comp_cost = self.multiwfn.partitionCharge(need_multipoles, f, - multiwfn_path, charge_type, owd, - molden_filename=molden_filename, xyz_filename=xyz_filename) + if use_pdb_charges: + # Skip partitionCharge — df_all already built above + pass + else: + # Partition charges (with multipole analysis if needed) + # Extract actual filenames from paths + molden_filename = os.path.basename(self.molden_paths[counter]) + xyz_filename = os.path.basename(self.xyz_paths[counter]) + comp_cost = self.multiwfn.partitionCharge(need_multipoles, f, + multiwfn_path, charge_type, owd, + molden_filename=molden_filename, xyz_filename=xyz_filename) - if comp_cost == -1: - print(f"Warning: Charge calculation failed for {f}, skipping") - counter += 1 - continue + if comp_cost == -1: + print(f"Warning: Charge calculation failed for {f}, skipping") + counter += 1 + continue + + # Get geometry information + geom = Geometry(file_path_xyz) + df_geom = geom.getGeomInfo() + + if use_pdb_charges: + # pdb_only: df_all and df_geom already built above; just build multipole_dict + multipole_dict = {int(row['Index']): row for _, row in df_all.iterrows()} + multipole_available = False + else: + # Load charge/multipole data with automatic fallback + # Use directory path directly (f is already absolute path from lst_of_folders) + multipole_name = f"{f}/Multipole{charge_type}.txt" + monopole_name = f"{f}/Charges{charge_type}.txt" - # Get geometry information - geom = Geometry(file_path_xyz) - df_geom = geom.getGeomInfo() - - # Load charge/multipole data with automatic fallback - # Use directory path directly (f is already absolute path from lst_of_folders) - multipole_name = f"{f}/Multipole{charge_type}.txt" - monopole_name = f"{f}/Charges{charge_type}.txt" - - # Try to load multipole file first, fall back to charges if not available - multipole_available = os.path.exists(multipole_name) - - if need_multipoles and not multipole_available: - print(f"Warning: Multipole file not found for {f}, falling back to charges-only") - print(f" Expected path: {multipole_name}") - print(f" Requested multipole_order={multipole_order}, but only charges available") - print(f" Setting both substrate and environment to monopole-only") - substrate_multipole_order = 1 - env_multipole_order = 1 - need_multipoles = False - - if multipole_available and need_multipoles: - # Load multipole data - print(f"Loading multipole data from: {multipole_name}") - atomicDict = MultiwfnInterface.getmultipoles(multipole_name) - if not atomicDict: - print(f"Warning: Multipole file exists but parsing failed for {f}") - print(f" File path: {multipole_name}") - print(f" Falling back to charges-only mode") + # Try to load multipole file first, fall back to charges if not available + multipole_available = os.path.exists(multipole_name) + + if need_multipoles and not multipole_available: + print(f"Warning: Multipole file not found for {f}, falling back to charges-only") + print(f" Expected path: {multipole_name}") + print(f" Requested multipole_order={multipole_order}, but only charges available") + print(f" Setting both substrate and environment to monopole-only") substrate_multipole_order = 1 env_multipole_order = 1 need_multipoles = False - # Load charges instead + + if multipole_available and need_multipoles: + # Load multipole data + print(f"Loading multipole data from: {multipole_name}") + atomicDict = MultiwfnInterface.getmultipoles(multipole_name) + if not atomicDict: + print(f"Warning: Multipole file exists but parsing failed for {f}") + print(f" File path: {multipole_name}") + print(f" Falling back to charges-only mode") + substrate_multipole_order = 1 + env_multipole_order = 1 + need_multipoles = False + # Load charges instead + df_all = pd.read_csv(monopole_name, sep='\s+', + names=["Element", 'x', 'y', 'z', "Atom_Charge"]) + df_all["Index"] = range(0, len(df_all)) + df_all['Dipole_Moment'] = [[0.0, 0.0, 0.0]] * len(df_all) + df_all['Quadrupole_Moment'] = [np.zeros((3, 3))] * len(df_all) + multipole_dict = {int(row['Index']): row for _, row in df_all.iterrows()} + else: + df_all = pd.DataFrame(atomicDict) + df_all['Index'] = df_all['Index'].astype(int) - 1 + multipole_dict = {int(row['Index']): row for _, row in df_all.iterrows()} + else: + # Load charges-only data df_all = pd.read_csv(monopole_name, sep='\s+', names=["Element", 'x', 'y', 'z', "Atom_Charge"]) df_all["Index"] = range(0, len(df_all)) df_all['Dipole_Moment'] = [[0.0, 0.0, 0.0]] * len(df_all) df_all['Quadrupole_Moment'] = [np.zeros((3, 3))] * len(df_all) - multipole_dict = {int(row['Index']): row for _, row in df_all.iterrows()} - else: - df_all = pd.DataFrame(atomicDict) - df_all['Index'] = df_all['Index'].astype(int) - 1 - multipole_dict = {int(row['Index']): row for _, row in df_all.iterrows()} - else: - # Load charges-only data - df_all = pd.read_csv(monopole_name, sep='\s+', - names=["Element", 'x', 'y', 'z', "Atom_Charge"]) - df_all["Index"] = range(0, len(df_all)) - df_all['Dipole_Moment'] = [[0.0, 0.0, 0.0]] * len(df_all) - df_all['Quadrupole_Moment'] = [np.zeros((3, 3))] * len(df_all) - multipole_dict = {row['Index']: row for _, row in df_all.iterrows()} + multipole_dict = {row['Index']: row for _, row in df_all.iterrows()} # Now handle substrate and environment separately for QM/MM support df_substrate = df_all[df_all["Index"].isin(substrate_idx)].copy() @@ -4694,7 +4940,9 @@ def getElectrostatic_stabilization(self, multiwfn_path=None, # Create PDB file pdb_name = f'estab_{charge_type}_{structure_name}_tensor_sub{substrate_multipole_order}_env{env_multipole_order}.pdb' - if multipole_available and (substrate_multipole_order >= 2 or env_multipole_order >= 2): + if use_pdb_charges: + print(f" Visualization not supported in pdb_only mode; skipping PDB creation.") + elif multipole_available and (substrate_multipole_order >= 2 or env_multipole_order >= 2): Visualize(file_path_xyz).makePDB(multipole_name, b_factors, pdb_name) else: Visualize(file_path_xyz).makePDB(monopole_name, b_factors, pdb_name) diff --git a/pyef/resources/settings.ini b/pyef/resources/settings.ini index 09bd18a..d2f4bff 100644 --- a/pyef/resources/settings.ini +++ b/pyef/resources/settings.ini @@ -99,7 +99,7 @@ outmedinfo= 0 // Output some intermediate information, mainly for debugging purpose; 0/1; 0 iwfntmptype= 1 // When atomic wavefunction files are needed to be generated, =1 delete the old "wfntmp" folder (if presents) and then use the newly built "wfntmp" folder, =2 Build and use "wfntmp" folder with different suffix, e.g. wfntmp0001, wfntmp0002... so that you can simultaneously run multiple instances without conflict; 1/2; 1 iaddprefix= 0 // =1: Add name of inputted file as prefix of exported file =0: Do not add; 0/1; 0 - ispecial= 0 // Sob only knows. 1=For RCY, 2=For shubin's 2nd project, Hirshfeld density as reference, or obtain Renyi entropy, 0/1/2; 0 + ispecial= 1 // Sob only knows. 1=For RCY, 2=For shubin's 2nd project, Hirshfeld density as reference, or obtain Renyi entropy, 0/1/2; 0 //The last opened file name (There must be a space line in present file) lastfile= D:\CM\my_program\Multiwfn\examples\excit\NH2_C8_NO2\NH2_C8_NO2.fchk diff --git a/pyef/utility.py b/pyef/utility.py index 073771c..799b937 100644 --- a/pyef/utility.py +++ b/pyef/utility.py @@ -387,23 +387,30 @@ def build_hybrid_core_map(self, name, cutoff_Z, heavy_family, base_maps): def fix_ECPmolden(self, owd, ECP): - """Prepares output terachem data for analysis, mainly isolating final .xyz frame and naming .molden file appropriotely""" + """Fix ECP artifacts in TeraChem Molden files for Multiwfn compatibility. + + Performs three corrections: + 1. Replaces full atomic numbers with valence electron counts for ECP atoms + 2. Adds [Pseudo] section listing ECP atoms (required by Multiwfn) + 3. Removes any incorrectly added [5d]/[7f]/[9g] tags (TeraChem writes + Cartesian basis functions which is the Molden default) + """ from .geometry import Geometry # Import here to avoid circular dependency FAMILIES = ["lanl2dz", "stuttgart_rsc", "def2", "crenbl"] # Hybrid families with cutoff rules HYBRID_FAMILIES = { "lacvps": { - "cutoff_Z": 18, # Z <= 18 ?~F~R all-electron + "cutoff_Z": 18, # Z <= 18 → all-electron "heavy_family": "lanl2dz"}, "lacvp": { - "cutoff_Z": 10, # Z <= 10 ?~F~R all-electron + "cutoff_Z": 10, # Z <= 10 → all-electron "heavy_family": "lanl2dz" } } #check if the ECP json already exists... if not you will need to generate it using the funcitonality above json_path = owd + '/' + ECP + "ecp_core_maps.json" - + # Check if the file exists if os.path.exists(json_path) and os.path.isfile(json_path): pass @@ -433,9 +440,9 @@ def fix_ECPmolden(self, owd, ECP): base_maps[ECP] = self.build_core_map(ECP) with open(json_path, "w") as f: json.dump(base_maps, f, indent=2) - - #Now the json should be establish - + + #Now the json should be establish + with open(json_path, "r") as f: print(f' >loading {ECP} parameters') ecp_dict = json.load(f)[ECP] @@ -450,15 +457,72 @@ def fix_ECPmolden(self, owd, ECP): Z = self.periodic_table[elem] new_Z = Z - ecp_dict[elem] change_dict[elem] = new_Z - + with open(self.moldenFile, 'r') as file: - content = file.read() - for change_elem, new_Z in change_dict.items(): - print(f' Molden file at {self.moldenFile} changed: {change_elem} non-core electrons set to {new_Z}') - pattern = re.compile(rf'({change_elem}\s+\d+\s+)(\d+)') - content = pattern.sub(rf'\g<1>{new_Z}', content) - with open(self.moldenFile, 'w') as file: - file.write(content) + lines = file.readlines() + + # Check if fixes have already been applied (idempotency) + full_content = ''.join(lines) + has_pseudo_already = '[pseudo]' in full_content.lower() + + # --- Step 1: Fix atomic numbers in [Atoms] section --- + # Also collect ECP atom entries for the [Pseudo] section + pseudo_entries = [] # list of (symbol, atom_index, new_Z) + in_atoms = False + gto_line_idx = None + mo_line_idx = None + + for i, line in enumerate(lines): + stripped = line.strip() + if stripped.lower().startswith('[atoms]'): + in_atoms = True + continue + if stripped.lower().startswith('[pseudo]'): + in_atoms = False + continue + if stripped.lower().startswith('[gto]'): + in_atoms = False + gto_line_idx = i + continue + if stripped.lower().startswith('[mo]'): + mo_line_idx = i + continue + + if in_atoms and stripped: + parts = line.split() + if len(parts) >= 6: + sym = parts[0] + if sym in change_dict: + new_Z = change_dict[sym] + atom_idx = parts[1] + current_Z = int(parts[2]) + # Only replace if not already corrected + if current_Z != new_Z: + pattern = re.compile(rf'({re.escape(sym)}\s+{re.escape(atom_idx)}\s+)(\d+)') + lines[i] = pattern.sub(rf'\g<1>{new_Z}', line) + pseudo_entries.append((sym, atom_idx, new_Z)) + + for change_elem, new_Z in change_dict.items(): + print(f' Molden file at {self.moldenFile} changed: {change_elem} non-core electrons set to {new_Z}') + + # --- Step 2: Add [Pseudo] section before [GTO] --- + if pseudo_entries and gto_line_idx is not None and not has_pseudo_already: + pseudo_lines = ['[Pseudo]\n'] + for sym, atom_idx, new_Z in pseudo_entries: + pseudo_lines.append(f'{sym} {atom_idx} {new_Z}\n') + lines[gto_line_idx:gto_line_idx] = pseudo_lines + # Adjust mo_line_idx since we inserted lines before it + if mo_line_idx is not None: + mo_line_idx += len(pseudo_lines) + + # --- Step 3: Remove any incorrectly added [5d]/[7f]/[9g] tags --- + # TeraChem writes Cartesian basis functions in Molden format (6d/10f/15g), + # which is the Molden default. These tags should NOT be present. + lines = [l for l in lines if l.strip().lower() not in ('[5d]', '[7f]', '[9g]', + '[5d7f]', '[5d10f]')] + + with open(self.moldenFile, 'w') as file: + file.writelines(lines) # ============================================================================ @@ -554,6 +618,8 @@ def run_multiwfn(self, command, input_commands, output_file=None, If Multiwfn fails with non-zero exit code """ # Prepare the full command with output redirection if specified + # Prepend ulimit and KMP_STACKSIZE to prevent segfaults with large systems + command = f"ulimit -s unlimited; export KMP_STACKSIZE=200000000; {command}" full_command = command if output_file: full_command = f"{command} > {output_file}"