From 4a26255a053de693cea8f45cc867868e27d7467b Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Mon, 23 Feb 2026 22:45:36 +0100 Subject: [PATCH 01/15] address #86 --- psiflow/data/dataset.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py index a2e866e..e332167 100644 --- a/psiflow/data/dataset.py +++ b/psiflow/data/dataset.py @@ -9,7 +9,7 @@ from parsl.app.app import join_app, python_app from parsl.app.python import PythonApp from parsl.data_provider.files import File -from parsl.dataflow.futures import AppFuture +from parsl.dataflow.futures import AppFuture, DataFuture import psiflow from psiflow.geometry import QUANTITIES, Geometry @@ -127,7 +127,7 @@ def __getitem__( ).outputs[0] return Dataset(None, extxyz) - def save(self, path: Union[Path, str]) -> AppFuture: + def save(self, path: Union[Path, str]) -> DataFuture: """ Save the dataset to a file. @@ -135,13 +135,14 @@ def save(self, path: Union[Path, str]) -> AppFuture: path: Path to save the dataset. Returns: - AppFuture: Future representing the completion of the save operation. + DataFuture: Future representing the file to which will be saved. 
""" path = psiflow.resolve_and_check(Path(path)) - _ = copy_data_future( + future = copy_data_future( inputs=[self.extxyz], outputs=[File(str(path))], ) + return future.outputs[0] def geometries(self) -> AppFuture: """ From 32324d62307bbb63f049c23639b9eb7dc3938a0f Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Wed, 25 Feb 2026 01:21:03 +0100 Subject: [PATCH 02/15] Update execution.py - working towards cleaner input configs - started moving more functionality into ExecutionDefinition - added lots of TODO things and questions --- psiflow/execution.py | 845 ++++++++++++++++++++++--------------------- 1 file changed, 435 insertions(+), 410 deletions(-) diff --git a/psiflow/execution.py b/psiflow/execution.py index c632935..1e4b758 100644 --- a/psiflow/execution.py +++ b/psiflow/execution.py @@ -1,10 +1,10 @@ -from __future__ import annotations # necessary for type-guarding class methods - import logging -import math import re import shutil import sys +import warnings +import subprocess +from datetime import datetime, timedelta from dataclasses import dataclass from pathlib import Path from threading import Lock @@ -14,12 +14,10 @@ import parsl import psutil -import pytimeparse -import typeguard import yaml from parsl.config import Config from parsl.data_provider.files import File -from parsl.executors import ( # WorkQueueExecutor, +from parsl.executors import ( HighThroughputExecutor, ThreadPoolExecutor, WorkQueueExecutor, @@ -36,364 +34,481 @@ PSIFLOW_INTERNAL = "psiflow_internal" - -EXECUTION_KWARGS = ( - "container_uri", - "container_engine", - "container_addopts", - "container_entrypoint", -) +DEFAULT_CONFIG = { # TODO: remove + "ModelEvaluation": {"gpu": False, "use_threadpool": True}, + "ModelTraining": {"gpu": True, "use_threadpool": True}, +} @dataclass class ContainerSpec: + """Controls container configuration""" + uri: str engine: str = "apptainer" addopts: str = " --no-eval -e --no-mount home -W /tmp --writable-tmpfs" - entrypoint: str = "/opt/entry.sh" + 
gpu_flavour: str | None = None # TODO: add yaml argument def __post_init__(self): assert self.engine in ("apptainer", "singularity") assert len(self.uri) > 0 + assert self.gpu_flavour in ("cuda", "rocm", None) - def launch_command(self, gpu: bool = False) -> str: - # TODO: pretty sure some of this is overkill + def launch_command(self) -> str: pwd = Path.cwd().resolve() # access to data / internal dir args = [self.engine, "exec", self.addopts, f"--bind {pwd}"] - if gpu: - if "rocm" in self.uri: - args.append("--rocm") - else: - args.append("--nv") - args += [self.uri, self.entrypoint] + if self.gpu_flavour == "cuda": + args.append("--nv") + elif self.gpu_flavour == "rocm": + args.append("--rocm") return " ".join(args) @staticmethod - def from_kwargs(kwargs: dict) -> Optional[ContainerSpec]: + def from_kwargs(kwargs: dict) -> Optional["ContainerSpec"]: if "container_uri" not in kwargs: return None - keys = ( - "container_uri", - "container_engine", - "container_addopts", - "container_entrypoint", - ) + keys = ("container_uri", "container_engine", "container_addopts") args = [kwargs[key] for key in keys if key in kwargs] - return ContainerSpec(*args) # TODO: slightly hacky + return ContainerSpec(*args) + + +class ReferenceSpec(Protocol): + """Defines default options for Reference implementations""" + + name: ClassVar[str] + reference_args: ClassVar[tuple[str, ...]] + mpi_command: str + mpi_args: Iterable[str] + executable: str + + def launch_command(self) -> str: + raise NotImplementedError + + @classmethod + def from_kwargs(cls, **kwargs): + keys = ("mpi_command", "mpi_args", "executable") + return cls(**{k: kwargs[k] for k in keys if k in kwargs}) + + +@dataclass +class CP2KReferenceSpec(ReferenceSpec): + name = "CP2K" + reference_args = ("cores_per_worker",) + mpi_command: str = "mpirun -np {cores_per_worker}" + mpi_args: tuple[str, ...] 
= ( + "-ENV OMP_NUM_THREADS=1", + "--bind-to core", + "--map-by core", + ) + executable: str = "cp2k.psmp -i cp2k.inp" + + def launch_command(self): + return " ".join([self.mpi_command, *self.mpi_args, self.executable]) + + +@dataclass +class GPAWReferenceSpec(ReferenceSpec): + name = "GPAW" + reference_args = ("cores_per_worker",) + mpi_command: str = "mpirun -np {cores_per_worker}" + mpi_args: tuple[str, ...] = ( + "-x OMP_NUM_THREADS=1", + "--bind-to core", + "--map-by core", + ) + executable: str = "gpaw python script_gpaw.py input.json" + + def launch_command(self): + return " ".join([self.mpi_command, *self.mpi_args, self.executable]) + + +@dataclass +class ORCAReferenceSpec(ReferenceSpec): + name = "ORCA" + reference_args = () + mpi_command: str = "" + mpi_args: tuple[str, ...] = ( + "-x OMP_NUM_THREADS=1", + "--bind-to core", + "--map-by core", + ) + executable: str = "$(which orca) orca.inp" + + def launch_command(self): + mpi_str = " ".join(self.mpi_args) + return f'{self.executable} "{mpi_str}"' + + +REFERENCE_SPECS = { + "CP2K": CP2KReferenceSpec, + "GPAW": GPAWReferenceSpec, + "ORCA": ORCAReferenceSpec, +} + + +def str_to_timedelta(s: str) -> timedelta: + # TODO: move to utils + t = datetime.strptime(s, "%H:%M:%S") + return timedelta(hours=t.hour, minutes=t.minute, seconds=t.second) + + +def make_slurm_provider(kwargs: dict) -> tuple[SlurmProvider, dict]: + defaults = {"init_blocks": 0, "exclusive": False} + required = ("cores_per_node", "walltime", "gpus_per_node") + kwargs = defaults | kwargs + assert all(key in kwargs for key in required) + provider = SlurmProvider(**kwargs) # does not configure Launcher + resources = { + "nodes": provider.nodes_per_block, + "cores": provider.cores_per_node, + "memory": provider.mem_per_node, + "gpus": provider.gpus_per_node, + "lifetime": str_to_timedelta(provider.walltime).seconds, + } + return provider, resources + + +def make_local_provider(kwargs: dict) -> tuple[LocalProvider, dict]: + resources = { + "nodes": 
1, + "cores": kwargs.get("cores", psutil.cpu_count()), + "memory": kwargs.get( + "memory", psutil.virtual_memory().available / 1e9 + ), # TODO: available? + "lifetime": float("inf"), + } + if "gpus" in kwargs: + resources["gpus"] = kwargs["gpus"] + else: + out = "" + try: + out = subprocess.check_output( + "nvidia-smi -L || amd-smi list", + shell=True, + text=True, + stderr=subprocess.DEVNULL, + ) + except subprocess.CalledProcessError: + pass # nvidia-sm and amd-smi not found TODO: not properly tested + resources["gpus"] = out.count("\n") + provider = LocalProvider(init_blocks=0) + return provider, resources -@typeguard.typechecked class ExecutionDefinition: + # TODO: do not like defining some kwargs in class method and other kwargs in init... def __init__( self, - parsl_provider: ExecutionProvider, - gpu: bool, - cores_per_worker: int, - use_threadpool: bool, + provider: ExecutionProvider | None, + executor_type: str, + executor_kwargs: dict, + resources: dict, container: Optional[ContainerSpec], - max_workers: Optional[int] = None, + max_runtime: str | None = None, + env_vars: Optional[dict[str, str]] = None, **kwargs, - ) -> None: - self.parsl_provider = parsl_provider - self.gpu = gpu - self.cores_per_worker = cores_per_worker - self.use_threadpool = use_threadpool + ): + self.provider = provider + self.executor_type = executor_type + self.kwargs = executor_kwargs + self.resources = resources # compute per node self.container = container - self._max_workers = max_workers + self.env_vars = env_vars or {} + + if self.use_gpu: + msg = "" + if resources["gpus"] == 0: + msg = "GPU usage requested but no GPUs available" + elif container is not None and container.gpu_flavour is None: + msg = "Provide 'gpu_flavour' to choose between CUDA and ROCM" + if msg: + raise ValueError(msg) + + # how long can individual tasks run (in seconds) + if max_runtime is None: + # allow some margin for task cleanup TODO: pretty random + max_runtime = max(0.9 * self.lifetime, 
self.lifetime - 60) + else: + max_runtime = str_to_timedelta(max_runtime).seconds + if max_runtime != float("inf") and max_runtime >= self.lifetime: + warnings.warn( + "Allowed task runtime exceeds provider walltime. Tasks might get killed by the scheduler." + ) + self.max_runtime = max_runtime + + # TODO: check that WQ kwargs do not exceed resources? + # TODO: how to handle env variables? + pass @property def name(self) -> str: return self.__class__.__name__ @property - def cores_available(self): - if type(self.parsl_provider) is LocalProvider: # noqa: F405 - cores_available = psutil.cpu_count(logical=False) - elif type(self.parsl_provider) is SlurmProvider: - cores_available = self.parsl_provider.cores_per_node - else: - cores_available = float("inf") - return cores_available - - @property - def max_workers(self): - if self._max_workers is not None: - return self._max_workers - else: - return max(1, math.floor(self.cores_available / self.cores_per_worker)) + def lifetime(self) -> float: + """How long will this manager survive (in seconds)""" + return self.resources["lifetime"] @property - def max_runtime(self): - if type(self.parsl_provider) is SlurmProvider: - walltime = pytimeparse.parse(self.parsl_provider.walltime) - else: - walltime = 1e9 - return walltime + def use_gpu(self) -> bool: + return self.kwargs.get("use_gpu") or self.kwargs.get("gpus_per_task") > 0 + + def wrap_in_timeout(self, command: str) -> str: + if self.max_runtime == float("inf"): + return command # noop + + # send SIGTERM after max_runtime, follow with SIGKILL 30s later + return f"timeout -k 30s {self.max_runtime}s {command}" + + def _create_threadpool(self, path: Path) -> ThreadPoolExecutor: + max_threads = self.kwargs["max_threads"] + return ThreadPoolExecutor(self.name, max_threads, working_dir=str(path)) + + def _create_workqueue(self, path: Path) -> WorkQueueExecutor: + """See https://cctools.readthedocs.io/en/latest/man_pages/work_queue_worker/#synopsis""" + + # ensure proper scale 
in # TODO: why is this needed? + timeout = int(1e6) if self.resources["nodes"] > 1 else 20 + cores = self.resources["cores"] + + worker_options = [ + "--parent-death", + f"--cores={cores}", + f"--timeout={timeout}", + ] + if (memory := self.resources["memory"]) is not None: + worker_options.append(f"--memory={memory * 1000}") # in MB + if (lifetime := self.lifetime) != float("inf"): + # allow some margin for WQ startup + walltime = max(0.95 * lifetime, lifetime - 30) + worker_options.append(f"-wall-time={walltime}") + if self.use_gpu: + gpus = self.resources["gpus"] + worker_options.append(f"--gpus={gpus}") + + worker_executable = "work_queue_worker" + if not isinstance(self, ReferenceEvaluation) and self.container: + # ModelEvaluation / ModelTraining run in container themselves + # Reference instances launch tasks in container + prepend = self.container.launch_command() + worker_executable = f"{prepend} {worker_executable}" + + # TODO: why the custom WQ? + executor = MyWorkQueueExecutor( + label=self.name, + working_dir=str(path / self.name), + provider=self.provider, + shared_fs=True, + # autocategory=False, + # port=0, + # max_retries=1, + # coprocess=False, + worker_options=" ".join(worker_options), + worker_executable=worker_executable, + scaling_cores_per_worker=cores, + ) + return executor def create_executor(self, path: Path) -> ParslExecutor: - if self.use_threadpool: - executor = ThreadPoolExecutor( - max_threads=self.cores_per_worker, - working_dir=str(path), - label=self.name, - ) - else: - cores = self.max_workers * self.cores_per_worker - worker_options = [ - "--parent-death", - "--wall-time={}".format(self.max_runtime), - "--cores={}".format(cores), - ] - if self.gpu: - worker_options.append("--gpus={}".format(self.max_workers)) - - # ensure proper scale in - if getattr(self.parsl_provider, "nodes_per_block", 1) > 1: - worker_options.append("--idle-timeout={}".format(int(1e6))) - else: - worker_options.append("--idle-timeout={}".format(20)) + if 
self.executor_type == "threadpool": + return self._create_threadpool(path) + return self._create_workqueue(path) - # only ModelEvaluation / ModelTraining / default_htex run in containers - if not isinstance(self, ReferenceEvaluation) and self.container: - prepend = self.container.launch_command(self.gpu) - worker_executable = f"{prepend} work_queue_worker" - else: - worker_executable = "work_queue_worker" - - executor = MyWorkQueueExecutor( - label=self.name, - working_dir=str(path / self.name), - provider=self.parsl_provider, - shared_fs=True, - autocategory=False, - port=0, - max_retries=1, - coprocess=False, - worker_options=" ".join(worker_options), - worker_executable=worker_executable, - scaling_cores_per_worker=cores, - ) - return executor + def wq_resources(self, *args, **kwargs) -> dict: + if self.executor_type == "threadpool": + return {} + + # TODO: why recreate every call? + # TODO: priority + spec = { + "cores": self.kwargs["cores_per_task"], + "memory": int(self.kwargs["mem_per_task"] * 1000), # in MB + "gpus": self.kwargs["gpus_per_task"], + "disk": 0, # not implemented + "running_time_min": self.kwargs["min_runtime"], + } + return self._modify_wq_resources(spec, *args, **kwargs) + + def _modify_wq_resources(self, spec: dict, *args, **kwargs) -> dict: + raise NotImplementedError @classmethod def from_config( cls, - gpu: bool = False, - cores_per_worker: int = 1, - use_threadpool: bool = False, + executor: str = "workqueue", container: Optional[ContainerSpec] = None, **kwargs, ): - # search for any section in the config which defines the Parsl ExecutionProvider - # if none are found, default to LocalProvider - # currently only checking for SLURM - if "slurm" in kwargs: - provider_cls = SlurmProvider - provider_kwargs = kwargs.pop("slurm") # do not allow empty dict - provider_kwargs["init_blocks"] = 0 - provider_kwargs.setdefault("exclusive", False) + if executor == "threadpool": + assert container is None, "Threadpool not compatible with containers" + 
assert ( + "slurm" not in kwargs + ), "Threadpool not compatible with remote execution" + assert "max_threads" in kwargs, "Specify 'max_threads' for parallelism" + executor_kwargs = { + "max_threads": kwargs["max_threads"], + "use_gpu": kwargs.get("use_gpu", False), + } + elif executor == "workqueue": + executor_kwargs = { + "cores_per_task": kwargs.get("cores_per_task", 0), + "gpus_per_task": kwargs.get("gpus_per_task", 0), + "mem_per_task": kwargs.get("mem_per_task", 0), + } + assert any(v != 0 for v in executor_kwargs.values()) + min_runtime = kwargs.get("min_runtime", "00:00:00") + executor_kwargs["min_runtime"] = str_to_timedelta(min_runtime).seconds else: - provider_cls = LocalProvider # noqa: F405 - provider_kwargs = kwargs.pop("local", {}) + raise ValueError("Key 'executor' must be 'threadpool' or 'workqueue'") - # if multi-node blocks are requested, make sure we're using SlurmProvider - if provider_kwargs.get("nodes_per_block", 1) > 1: - launcher = SlurmLauncher() + # search for Parsl ExecutionProvider block, defaulting to "local" + if "slurm" in kwargs: + # use SlurmLauncher if multi-node blocks are requested TODO: what does this fix? + provider, resources = make_slurm_provider(kwargs["slurm"]) + launcher = SlurmLauncher() if resources["nodes"] > 1 else SimpleLauncher() + provider.launcher = launcher else: - launcher = SimpleLauncher() - - if container is not None: - # TODO: why not exactly? 
- assert not use_threadpool + provider, resources = make_local_provider(kwargs.get("local", {})) + if executor == "threadpool": + provider = None # no provider needed - # initialize provider - parsl_provider = provider_cls( - launcher=launcher, - **provider_kwargs, - ) return cls( - parsl_provider=parsl_provider, - gpu=gpu, - use_threadpool=use_threadpool, + provider=provider, + executor_type=executor, + executor_kwargs=executor_kwargs, + resources=resources, container=container, - cores_per_worker=cores_per_worker, **kwargs, ) -@typeguard.typechecked class ModelEvaluation(ExecutionDefinition): def __init__( self, - max_simulation_time: Optional[float] = None, - timeout: float = (10 / 60), # 5 seconds - env_vars: Optional[dict[str, str]] = None, + timeout: float = 5, # TODO: units? **kwargs, - ) -> None: + ): super().__init__(**kwargs) - if max_simulation_time is not None: - assert max_simulation_time * 60 < self.max_runtime - self.max_simulation_time = max_simulation_time self.timeout = timeout - default_env_vars = { - "OMP_NUM_THREADS": str(self.cores_per_worker), - "KMP_AFFINITY": "granularity=fine,compact,1,0", - "KMP_BLOCKTIME": "1", - "OMP_PROC_BIND": "false", - "PYTHONUNBUFFERED": "TRUE", - } - if env_vars is None: - env_vars = default_env_vars - else: - default_env_vars.update(env_vars) - env_vars = default_env_vars - self.env_vars = env_vars - - def server_command(self): - command_list = ["psiflow-server"] - if self.max_simulation_time is not None: - max_time = 0.9 * (60 * self.max_simulation_time) - command_list = ["timeout -s 15 {}s".format(max_time), *command_list] - return " ".join(command_list) - - def client_command(self): - command_list = ["psiflow-client"] - return " ".join(command_list) - - # def get_client_args( - # self, - # hamiltonian_name: str, - # nwalkers: int, - # motion: str, - # ) -> list[str]: - # # TODO: redo this - # if "MACE" in hamiltonian_name: - # if motion in ["minimize", "vibrations"]: - # dtype = "float64" - # else: - # dtype = 
"float32" - # nclients = min(nwalkers, self.max_workers) - # if self.gpu: - # template = "--dtype={} --device=cuda:{}" - # args = [template.format(dtype, i) for i in range(nclients)] - # else: - # template = "--dtype={} --device=cpu" - # args = [template.format(dtype) for i in range(nclients)] - # return args - # else: - # return [""] + # TODO: temporary + self.cores_per_worker = self.kwargs.get("cores_per_task", 1) + self.gpu = False + + # TODO: what with env vars? + # default_env_vars = { + # "OMP_NUM_THREADS": str(self.cores_per_worker), + # "KMP_AFFINITY": "granularity=fine,compact,1,0", + # "KMP_BLOCKTIME": "1", + # "OMP_PROC_BIND": "false", + # "PYTHONUNBUFFERED": "TRUE", + # } + + def server_command(self) -> str: + command = "psiflow-server" + return self.wrap_in_timeout(command) def get_driver_devices(self, nwalkers: int) -> list[dict]: # assumes driver is GPU capable - # TODO: what if only 1 gpu is available? - nclients = min(nwalkers, self.max_workers) + # TODO: what if only 1 gpu is available? Redo this + # nclients = min(nwalkers, self.max_workers) + nclients = min(nwalkers, 2) if self.gpu: - return [{'device': f'cuda:{i}'} for i in range(nclients)] + return [{"device": f"cuda:{i}"} for i in range(nclients)] else: - return [{'device': 'cpu'} for _ in range(nclients)] - - def wq_resources(self, nwalkers): - if self.use_threadpool: - return {} - nclients = min(nwalkers, self.max_workers) - resource_specification = {} - resource_specification["cores"] = nclients * self.cores_per_worker - resource_specification["disk"] = 1000 # some random nontrivial amount? 
- memory = 2000 * self.cores_per_worker # similarly rather random - resource_specification["memory"] = int(memory) - resource_specification["running_time_min"] = self.max_simulation_time - if self.gpu: - resource_specification["gpus"] = nclients - return resource_specification + return [{"device": "cpu"} for _ in range(nclients)] + + def _modify_wq_resources(self, spec: dict, *args, **kwargs) -> dict: + pass + + # def wq_resources(self, nwalkers): + # if self.use_threadpool: + # return {} + # nclients = min(nwalkers, self.max_workers) + # resource_specification = {} + # resource_specification["cores"] = nclients * self.cores_per_worker + # resource_specification["disk"] = 1000 # some random nontrivial amount? + # memory = 2000 * self.cores_per_worker # similarly rather random + # resource_specification["memory"] = int(memory) + # resource_specification["running_time_min"] = self.max_simulation_time + # if self.gpu: + # resource_specification["gpus"] = nclients + # return resource_specification -@typeguard.typechecked class ModelTraining(ExecutionDefinition): def __init__( self, - gpu=True, - max_training_time: Optional[float] = None, - env_vars: Optional[dict[str, str]] = None, - multigpu: bool = False, + multigpu: bool = False, # TODO: how to handle this? **kwargs, ) -> None: - super().__init__(gpu=gpu, **kwargs) - assert self.gpu - if max_training_time is not None: - assert max_training_time * 60 < self.max_runtime - self.max_training_time = max_training_time + super().__init__(**kwargs) self.multigpu = multigpu if self.multigpu: + # TODO: why? Think this might be a multinode thing - which I do not care about message = ( "the max_training_time keyword does not work " "in combination with multi-gpu training. 
Adjust " "the maximum number of epochs to control the " "duration of training" ) - assert self.max_training_time is None, message - - default_env_vars = { - "OMP_NUM_THREADS": str(self.cores_per_worker), - "KMP_AFFINITY": "granularity=fine,compact,1,0", - "KMP_BLOCKTIME": "1", - "OMP_PROC_BIND": "spread", # different from Model Eval - "PYTHONUNBUFFERED": "TRUE", - } - if env_vars is None: - env_vars = default_env_vars - else: - default_env_vars.update(env_vars) - env_vars = default_env_vars - self.env_vars = env_vars + assert self.max_runtime is None, message + + # default_env_vars = { + # "OMP_NUM_THREADS": str(self.cores_per_worker), + # "KMP_AFFINITY": "granularity=fine,compact,1,0", + # "KMP_BLOCKTIME": "1", + # "OMP_PROC_BIND": "spread", # different from Model Eval + # "PYTHONUNBUFFERED": "TRUE", + # } + # if env_vars is None: + # env_vars = default_env_vars + # else: + # default_env_vars.update(env_vars) + # env_vars = default_env_vars def train_command(self, initialize: bool = False): - # script = "$(python -c 'import psiflow.models.mace_utils; print(psiflow.models.mace_utils.__file__)')" - command_list = ["psiflow-mace-train"] - if (self.max_training_time is not None) and not initialize: - max_time = 0.9 * (60 * self.max_training_time) - command_list = ["timeout -s 15 {}s".format(max_time), *command_list] - return " ".join(command_list) - - def wq_resources(self): - if self.use_threadpool: - return {} - resource_specification = {} - - if self.multigpu: - nworkers = int(self.cores_available / self.cores_per_worker) - else: - nworkers = 1 - - resource_specification["gpus"] = nworkers # one per GPU - resource_specification["cores"] = self.cores_available - resource_specification["disk"] = ( - 1000 * nworkers - ) # some random nontrivial amount? 
- memory = 1000 * self.cores_available # similarly rather random - resource_specification["memory"] = int(memory) - resource_specification["running_time_min"] = self.max_training_time - return resource_specification + command = "psiflow-mace-train" + return self.wrap_in_timeout(command) + + def _modify_wq_resources(self, spec: dict, *args, **kwargs) -> dict: + pass + + # def wq_resources(self): + # if self.use_threadpool: + # return {} + # resource_specification = {} + # + # if self.multigpu: + # nworkers = int(self.cores_available / self.cores_per_worker) + # else: + # nworkers = 1 + # + # resource_specification["gpus"] = nworkers # one per GPU + # resource_specification["cores"] = self.cores_available + # resource_specification["disk"] = ( + # 1000 * nworkers + # ) # some random nontrivial amount? + # memory = 1000 * self.cores_available # similarly rather random + # resource_specification["memory"] = int(memory) + # resource_specification["running_time_min"] = self.max_training_time + # return resource_specification -@typeguard.typechecked class ReferenceEvaluation(ExecutionDefinition): def __init__( self, - spec: ReferenceSpec, - max_evaluation_time: Optional[float] = None, - memory_limit: Optional[str] = None, + spec: "ReferenceSpec", + memory_limit: Optional[str] = None, # TODO: how does this work? **kwargs, ) -> None: # TODO: how to know which code? 
super().__init__(**kwargs) self.spec = spec - self.max_evaluation_time = max_evaluation_time * 60 # seconds - if max_evaluation_time: - assert 0 < max_evaluation_time < self.max_runtime self.memory_limit = memory_limit def command(self): + # TODO: this does not work probably launch_command = self.spec.launch_command() kwargs = {k: getattr(self, k) for k in self.spec.reference_args} launch_command = launch_command.format(**kwargs) @@ -401,10 +516,7 @@ def command(self): if self.container is not None: launch_command = f"{self.container.launch_command()} {launch_command}" - if (max_time := self.max_evaluation_time) is None: - # leave some slack for startup and cleanup - max_time = max(0.9 * self.max_runtime, self.max_runtime - 5) - launch_command = f"timeout -s 9 {max_time}s {launch_command}" + launch_command = self.wrap_in_timeout(launch_command) commands = [] if self.memory_limit is not None: @@ -423,85 +535,14 @@ def parse_size(size): # TODO: to utils? # exit code 0 so parsl always thinks bash app succeeded return "\n".join([*commands, launch_command, "exit 0"]) - def wq_resources(self): - if self.use_threadpool: - return {} - resource_specification = {} - resource_specification["cores"] = self.cores_per_worker - resource_specification["disk"] = 1000 # some random nontrivial amount? 
- memory = 2000 * self.cores_per_worker # similarly rather random - resource_specification["memory"] = int(memory) - resource_specification["running_time_min"] = self.max_evaluation_time - return resource_specification + def _modify_wq_resources(self, spec: dict, *args, **kwargs) -> dict: + return spec @property def name(self) -> str: return self.spec.name -class ReferenceSpec(Protocol): - name: ClassVar[str] - reference_args: ClassVar[tuple[str, ...]] - mpi_command: str - mpi_args: Iterable[str] - executable: str - - def launch_command(self) -> str: - raise NotImplementedError - - -@dataclass -class CP2KReferenceSpec(ReferenceSpec): - name = "CP2K" - reference_args = ("cores_per_worker",) - mpi_command: str = "mpirun -np {cores_per_worker}" - mpi_args: tuple[str, ...] = ( - "-ENV OMP_NUM_THREADS=1", - "--bind-to core", - "--map-by core", - ) - executable: str = "cp2k.psmp -i cp2k.inp" - - def launch_command(self): - # use nprocs = ncores, nthreads = 1 - return " ".join([self.mpi_command, *self.mpi_args, self.executable]) - - -@dataclass -class GPAWReferenceSpec(ReferenceSpec): - name = "GPAW" - reference_args = ("cores_per_worker",) - mpi_command: str = "mpirun -np {cores_per_worker}" - mpi_args: tuple[str, ...] = ( - "-x OMP_NUM_THREADS=1", - "--bind-to core", - "--map-by core", - ) - executable: str = "gpaw python script_gpaw.py input.json" - - def launch_command(self): - # use nprocs = ncores, nthreads = 1 - return " ".join([self.mpi_command, *self.mpi_args, self.executable]) - - -@dataclass -class ORCAReferenceSpec(ReferenceSpec): - name = "ORCA" - reference_args = () - mpi_command: str = "" - mpi_args: tuple[str, ...] 
= ( - "-x OMP_NUM_THREADS=1", - "--bind-to core", - "--map-by core", - ) - executable: str = "$(which orca) orca.inp" - - def launch_command(self): - mpi_str = " ".join(self.mpi_args) - return f'{self.executable} "{mpi_str}"' - - -@typeguard.typechecked class ExecutionContext: """ Psiflow centralizes all execution-level configuration options using an ExecutionContext. @@ -513,7 +554,6 @@ class ExecutionContext: and QM evaluation apps. As such, we ensure that execution-side details are strictly separated from the definition of the computational graph itself. For more information, check out the psiflow documentation regarding execution. - """ def __init__( @@ -540,15 +580,15 @@ def __exit__(self, exc_type, exc_value, traceback): parsl.dfk().cleanup() def new_file(self, prefix: str, suffix: str) -> File: + assert prefix[-1] == "_" + assert suffix[0] == "." + padding = 6 with self.lock: - assert prefix[-1] == "_" - assert suffix[0] == "." key = (prefix, suffix) if key not in self.file_index.keys(): self.file_index[key] = 0 - padding = 6 assert self.file_index[key] < (16**padding) - identifier = "{0:0{1}x}".format(self.file_index[key], padding) + identifier = f"{self.file_index[key]:0{padding}x}" self.file_index[key] += 1 return File(str(self.path / (prefix + identifier + suffix))) @@ -562,25 +602,22 @@ def from_config( max_idletime: float = 20, internal_tasks_max_threads: int = 10, default_threads: int = 4, - htex_address: str = "127.0.0.1", + # htex_address: str = "127.0.0.1", zip_staging: Optional[bool] = None, make_symlinks: bool = False, **kwargs, - ) -> ExecutionContext: + ) -> "ExecutionContext": path = Path.cwd().resolve() / PSIFLOW_INTERNAL - psiflow.resolve_and_check(path) if path.exists(): shutil.rmtree(path) - path.mkdir(parents=True, exist_ok=True) - parsl.set_file_logger( - filename=str(path / "parsl.log"), - name="parsl", - level=getattr(logging, parsl_log_level), - ) + path.mkdir(parents=True) + + log_file = str(path / "parsl.log") + log_level = 
getattr(logging, parsl_log_level) + parsl.set_file_logger(filename=log_file, name="parsl", level=log_level) # create definitions base_container = ContainerSpec.from_kwargs(kwargs) - kwargs.pop("container_uri", None) model_evaluation = ModelEvaluation.from_config( container=base_container, **kwargs.pop("ModelEvaluation", {}), @@ -589,12 +626,19 @@ def from_config( container=base_container, **kwargs.pop("ModelTraining", {"gpu": True}), # avoid triggering assertion ) + + # TODO: remove this and check below + model_evaluation.wq_resources(0) + model_evaluation.server_command() + model_training.wq_resources() + reference_evaluations = [] # reference evaluations might be class specific for key in list(kwargs.keys()): if key[:4] in REFERENCE_SPECS: # allow for e.g., CP2K_small config = kwargs.pop(key) reference_evaluation = ReferenceEvaluation.from_config( - spec=init_spec(REFERENCE_SPECS[key[:4]], config), + # spec=init_spec(REFERENCE_SPECS[key[:4]], config), + spec=REFERENCE_SPECS[key[:4]].from_kwargs(**config), container=ContainerSpec.from_kwargs(kwargs | config), **config, ) @@ -605,13 +649,14 @@ def from_config( executors = [d.create_executor(path=path) for d in definitions] # create default executors + # TODO: extract this into function if base_container is not None: launcher = WrappedLauncher(prepend=base_container.launch_command()) else: launcher = SimpleLauncher() htex = HighThroughputExecutor( label="default_htex", - address=htex_address, + # address=htex_address, working_dir=str(path / "default_htex"), cores_per_worker=1, max_workers_per_node=default_threads, @@ -640,7 +685,7 @@ def from_config( executors=executors, run_dir=str(path), initialize_logging=False, - app_cache=False, + # app_cache=False, usage_tracking=usage_tracking, retries=retries, strategy=strategy, @@ -650,16 +695,16 @@ def from_config( ) context = ExecutionContext(config, definitions, path / "context_dir") - if make_symlinks: - src, dest = Path.cwd() / "psiflow_log", path / "parsl.log" - 
_create_symlink(src, dest) - src, dest = ( - Path.cwd() / "psiflow_submit_scripts", - path / "000" / "submit_scripts", - ) - _create_symlink(src, dest, is_dir=True) - src, dest = Path.cwd() / "psiflow_task_logs", path / "000" / "task_logs" - _create_symlink(src, dest, is_dir=True) + # if make_symlinks: + # src, dest = Path.cwd() / "psiflow_log", path / "parsl.log" + # _create_symlink(src, dest) + # src, dest = ( + # Path.cwd() / "psiflow_submit_scripts", + # path / "000" / "submit_scripts", + # ) + # _create_symlink(src, dest, is_dir=True) + # src, dest = Path.cwd() / "psiflow_task_logs", path / "000" / "task_logs" + # _create_symlink(src, dest, is_dir=True) return context @@ -670,33 +715,24 @@ class ExecutionContextLoader: @classmethod def load( cls, - psiflow_config: Optional[dict[str, Any]] = None, + config: Optional[dict[str, Any]] = None, ) -> ExecutionContext: if cls._context is not None: raise RuntimeError("ExecutionContext has already been loaded") - if psiflow_config is None: # assume yaml is passed as argument - if len(sys.argv) == 1: # no config passed, use threadpools: - psiflow_config = { - "ModelEvaluation": { - "gpu": False, - "use_threadpool": True, - }, - "ModelTraining": { - "gpu": True, - "use_threadpool": True, - }, - } + if config is None: + if len(sys.argv) == 1: # no yaml config passed, use threadpools: + config = DEFAULT_CONFIG else: assert len(sys.argv) == 2 path_config = psiflow.resolve_and_check(Path(sys.argv[1])) assert path_config.exists() assert path_config.suffix in [".yaml", ".yml"], ( - "the execution configuration needs to be specified" - " as a YAML file, but got {}".format(path_config) + f"the execution configuration needs to be specified" + f" as a YAML file, but got {path_config}" ) with open(path_config, "r") as f: - psiflow_config = yaml.safe_load(f) - cls._context = ExecutionContext.from_config(**psiflow_config) + config = yaml.safe_load(f) + cls._context = ExecutionContext.from_config(**config) return cls._context 
@classmethod @@ -711,6 +747,7 @@ def wait(cls): class SlurmLauncher(Launcher): + # TODO: what does this do? def __init__(self, debug: bool = True, overrides: str = ""): super().__init__(debug=debug) self.overrides = overrides @@ -746,29 +783,17 @@ def __call__(self, command: str, tasks_per_node: int, nodes_per_block: int) -> s class MyWorkQueueExecutor(WorkQueueExecutor): + # TODO: what does this do? def _get_launch_command(self, block_id): return self.worker_command -def _create_symlink(src: Path, dest: Path, is_dir: bool = False) -> None: - """Create or replace symbolic link""" - if src.is_symlink(): - src.unlink() - if is_dir: - dest.mkdir(parents=True, exist_ok=True) - else: - dest.touch(exist_ok=True) - src.symlink_to(dest, target_is_directory=is_dir) - - -REFERENCE_SPECS = { - "CP2K": CP2KReferenceSpec, - "GPAW": GPAWReferenceSpec, - "ORCA": ORCAReferenceSpec, -} - - -def init_spec(spec_cls: type(ReferenceSpec), kwargs: dict) -> ReferenceSpec: - keys = ("mpi_command", "mpi_args", "executable") - cls_kwargs = {k: kwargs[k] for k in keys if k in kwargs} - return spec_cls(**cls_kwargs) +# def _create_symlink(src: Path, dest: Path, is_dir: bool = False) -> None: +# """Create or replace symbolic link""" +# if src.is_symlink(): +# src.unlink() +# if is_dir: +# dest.mkdir(parents=True, exist_ok=True) +# else: +# dest.touch(exist_ok=True) +# src.symlink_to(dest, target_is_directory=is_dir) From ffc0c11862bd8aa0d14421bf97f3c381246d6641 Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Fri, 27 Feb 2026 16:39:45 +0100 Subject: [PATCH 03/15] implement very basic WQ priority handling WQ priority can be controlled using the context manager `with SetWQPriority:` which will set the 'priority' resource_spec argument. 
Very basic implementation, and we will need to be careful with how `wq_resources` is called and used --- psiflow/execution.py | 167 +++++++++++++++++++++++++++++++------------ 1 file changed, 120 insertions(+), 47 deletions(-) diff --git a/psiflow/execution.py b/psiflow/execution.py index 1e4b758..4ec0c70 100644 --- a/psiflow/execution.py +++ b/psiflow/execution.py @@ -28,16 +28,11 @@ from parsl.providers import LocalProvider, SlurmProvider from parsl.providers.base import ExecutionProvider -import psiflow logger = logging.getLogger(__name__) # logging per module -PSIFLOW_INTERNAL = "psiflow_internal" -DEFAULT_CONFIG = { # TODO: remove - "ModelEvaluation": {"gpu": False, "use_threadpool": True}, - "ModelTraining": {"gpu": True, "use_threadpool": True}, -} +PSIFLOW_INTERNAL = "psiflow_internal" # TODO: move configuration files somewhere @dataclass @@ -220,8 +215,22 @@ def __init__( if resources["gpus"] == 0: msg = "GPU usage requested but no GPUs available" elif container is not None and container.gpu_flavour is None: - msg = "Provide 'gpu_flavour' to choose between CUDA and ROCM" + msg = "Provide container 'gpu_flavour' to choose between CUDA and ROCM" + if msg: + raise ValueError(msg) + + if self.executor_type == "workqueue": + # WQ-specific checks TODO: check that WQ kwargs do not exceed resources? + msg = "" + if self.kwargs["gpus_per_task"] > resources["gpus"]: + msg = "GPUs" + if self.kwargs["cores_per_task"] > resources["cores"]: + msg = "cores" + if self.kwargs["mem_per_task"] > (resources["memory"] or float("inf")): + # TODO: do we need memory=None anywhere? otherwise default to inf? 
+ msg = "memory" if msg: + msg = f"Apps will request more {msg} than available per Parsl block" raise ValueError(msg) # how long can individual tasks run (in seconds) @@ -231,13 +240,24 @@ def __init__( else: max_runtime = str_to_timedelta(max_runtime).seconds if max_runtime != float("inf") and max_runtime >= self.lifetime: - warnings.warn( - "Allowed task runtime exceeds provider walltime. Tasks might get killed by the scheduler." - ) + msg = "Allowed task runtime exceeds provider walltime. Tasks might get killed by the scheduler." + warnings.warn(msg) self.max_runtime = max_runtime - # TODO: check that WQ kwargs do not exceed resources? + # set default WQ resource specs TODO: type_hint + self.spec = None + if self.executor_type == "workqueue": + self.spec = { + "cores": self.kwargs["cores_per_task"], + "memory": int(self.kwargs["mem_per_task"] * 1000), # in MB + "gpus": self.kwargs["gpus_per_task"], + "disk": 0, # not implemented + "running_time_min": self.kwargs["min_runtime"], + } + register_definition(definition=self) + # TODO: how to handle env variables? + pass @property @@ -251,7 +271,7 @@ def lifetime(self) -> float: @property def use_gpu(self) -> bool: - return self.kwargs.get("use_gpu") or self.kwargs.get("gpus_per_task") > 0 + return self.kwargs.get("use_gpu") or self.kwargs.get("gpus_per_task", 0) > 0 def wrap_in_timeout(self, command: str) -> str: if self.max_runtime == float("inf"): @@ -315,28 +335,13 @@ def create_executor(self, path: Path) -> ParslExecutor: return self._create_workqueue(path) def wq_resources(self, *args, **kwargs) -> dict: - if self.executor_type == "threadpool": - return {} - - # TODO: why recreate every call? 
- # TODO: priority - spec = { - "cores": self.kwargs["cores_per_task"], - "memory": int(self.kwargs["mem_per_task"] * 1000), # in MB - "gpus": self.kwargs["gpus_per_task"], - "disk": 0, # not implemented - "running_time_min": self.kwargs["min_runtime"], - } - return self._modify_wq_resources(spec, *args, **kwargs) - - def _modify_wq_resources(self, spec: dict, *args, **kwargs) -> dict: raise NotImplementedError @classmethod def from_config( cls, - executor: str = "workqueue", - container: Optional[ContainerSpec] = None, + executor: str, + container: Optional[ContainerSpec], **kwargs, ): if executor == "threadpool": @@ -394,6 +399,8 @@ def __init__( # TODO: temporary self.cores_per_worker = self.kwargs.get("cores_per_task", 1) self.gpu = False + self.max_simulation_time = self.max_runtime + self.env_vars = {"OMP_NUM_THREADS": "1"} # TODO: what with env vars? # default_env_vars = { @@ -418,8 +425,11 @@ def get_driver_devices(self, nwalkers: int) -> list[dict]: else: return [{"device": "cpu"} for _ in range(nclients)] - def _modify_wq_resources(self, spec: dict, *args, **kwargs) -> dict: - pass + def wq_resources(self, nwalkers: int) -> dict: + if self.spec is None: + return {} # threadpool + + return self.spec # def wq_resources(self, nwalkers): # if self.use_threadpool: @@ -471,8 +481,11 @@ def train_command(self, initialize: bool = False): command = "psiflow-mace-train" return self.wrap_in_timeout(command) - def _modify_wq_resources(self, spec: dict, *args, **kwargs) -> dict: - pass + def wq_resources(self, *args, **kwargs) -> dict: + if self.spec is None: + return {} # threadpool + + return self.spec # def wq_resources(self): # if self.use_threadpool: @@ -624,7 +637,9 @@ def from_config( ) model_training = ModelTraining.from_config( container=base_container, - **kwargs.pop("ModelTraining", {"gpu": True}), # avoid triggering assertion + **kwargs.pop( + "ModelTraining", {"gpu": True} + ), # avoid triggering assertion TODO: change into warning ) # TODO: remove this 
and check below @@ -719,19 +734,23 @@ def load( ) -> ExecutionContext: if cls._context is not None: raise RuntimeError("ExecutionContext has already been loaded") - if config is None: - if len(sys.argv) == 1: # no yaml config passed, use threadpools: - config = DEFAULT_CONFIG - else: - assert len(sys.argv) == 2 - path_config = psiflow.resolve_and_check(Path(sys.argv[1])) - assert path_config.exists() - assert path_config.suffix in [".yaml", ".yml"], ( - f"the execution configuration needs to be specified" - f" as a YAML file, but got {path_config}" - ) - with open(path_config, "r") as f: - config = yaml.safe_load(f) + if config is not None: + pass + elif len(sys.argv) == 1: + config = {} + else: + assert len(sys.argv) <= 2 # only accept a single argument + path_config = Path(sys.argv[1]) + assert path_config.exists() + assert path_config.suffix in [".yaml", ".yml"], ( + f"the execution configuration needs to be specified" + f" as a YAML file, but got {path_config}" + ) + with open(path_config, "r") as f: + config = yaml.safe_load(f) + + # set the context so it can be retrieved later + config = yaml.safe_load(DEFAULT_CONFIG) | config cls._context = ExecutionContext.from_config(**config) return cls._context @@ -797,3 +816,57 @@ def _get_launch_command(self, block_id): # else: # dest.touch(exist_ok=True) # src.symlink_to(dest, target_is_directory=is_dir) + + +# TODO: attempt at managing priority through global state +WQ_RESOURCES_REGISTRY = {} + + +def register_definition(definition: ExecutionDefinition) -> None: + """""" + if (spec := definition.spec) is None: + return # threadpool does not have priority + + WQ_RESOURCES_REGISTRY[definition.name] = spec + spec["priority"] = SetWQPriority.default + + +class SetWQPriority: + """Manage the WQ priority tag as context manager""" + + # TODO: this probably does not work in a nested way + # TODO: log to parsl.log? 
+ default = 0 + + def __init__(self, value: int, verbose: bool = False) -> None: + self.value = value + self.verbose = verbose + + def __enter__(self): + if self.verbose: + print(f'SetWQPriority setting priority:\t{self.value}') + for n, spec in WQ_RESOURCES_REGISTRY.items(): + spec["priority"] = self.value + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.verbose: + print(f'SetWQPriority unsetting {self.value}') + for n, spec in WQ_RESOURCES_REGISTRY.items(): + spec["priority"] = SetWQPriority.default + + +# This is the default psiflow config which is always passed into the ExecutionContext +# TODO: find a place for this +DEFAULT_CONFIG = """ +parsl_log_level: WARNING +usage_tracking: 3 + +ModelEvaluation: + executor: threadpool + max_threads: 2 + +ModelTraining: + executor: threadpool + max_threads: 2 +""" From 4e896d7b3d4bb94da588f2a9015d480f7c595f79 Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Fri, 27 Feb 2026 17:14:27 +0100 Subject: [PATCH 04/15] cleanup psiflow_internal remove the annoying '000' nesting level and place all executor output directly in psiflow_internal --- psiflow/execution.py | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/psiflow/execution.py b/psiflow/execution.py index 4ec0c70..79b3745 100644 --- a/psiflow/execution.py +++ b/psiflow/execution.py @@ -624,6 +624,7 @@ def from_config( if path.exists(): shutil.rmtree(path) path.mkdir(parents=True) + patch_parsl_dirtree() log_file = str(path / "parsl.log") log_level = getattr(logging, parsl_log_level) @@ -708,20 +709,7 @@ def from_config( internal_tasks_max_threads=internal_tasks_max_threads, # std_autopath=std_autopath, ) - context = ExecutionContext(config, definitions, path / "context_dir") - - # if make_symlinks: - # src, dest = Path.cwd() / "psiflow_log", path / "parsl.log" - # _create_symlink(src, dest) - # src, dest = ( - # Path.cwd() / "psiflow_submit_scripts", - # path / "000" / "submit_scripts", - 
# ) - # _create_symlink(src, dest, is_dir=True) - # src, dest = Path.cwd() / "psiflow_task_logs", path / "000" / "task_logs" - # _create_symlink(src, dest, is_dir=True) - - return context + return ExecutionContext(config, definitions, path / "context_dir") class ExecutionContextLoader: @@ -807,17 +795,6 @@ def _get_launch_command(self, block_id): return self.worker_command -# def _create_symlink(src: Path, dest: Path, is_dir: bool = False) -> None: -# """Create or replace symbolic link""" -# if src.is_symlink(): -# src.unlink() -# if is_dir: -# dest.mkdir(parents=True, exist_ok=True) -# else: -# dest.touch(exist_ok=True) -# src.symlink_to(dest, target_is_directory=is_dir) - - # TODO: attempt at managing priority through global state WQ_RESOURCES_REGISTRY = {} @@ -844,14 +821,14 @@ def __init__(self, value: int, verbose: bool = False) -> None: def __enter__(self): if self.verbose: - print(f'SetWQPriority setting priority:\t{self.value}') + print(f"SetWQPriority setting priority:\t{self.value}") for n, spec in WQ_RESOURCES_REGISTRY.items(): spec["priority"] = self.value return self def __exit__(self, exc_type, exc_val, exc_tb): if self.verbose: - print(f'SetWQPriority unsetting {self.value}') + print(f"SetWQPriority unsetting {self.value}") for n, spec in WQ_RESOURCES_REGISTRY.items(): spec["priority"] = SetWQPriority.default @@ -870,3 +847,12 @@ def __exit__(self, exc_type, exc_val, exc_tb): executor: threadpool max_threads: 2 """ + + +def patch_parsl_dirtree() -> None: + """By default, Parsl will put Executor logs etc. under numbered directories. 
+ We do not need this level of nesting, as psiflow_internal is refreshed every run""" + import parsl.dataflow.dflow + + # replace with noop, which needs to happen after parsl.dataflow.dflow initialises + parsl.dataflow.dflow.make_rundir = lambda x: x From 9c48ce0eae53ea2e3065fa341cd765b4c8d7a21a Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Mon, 2 Mar 2026 23:09:00 +0100 Subject: [PATCH 05/15] update execution part 2 - further cleanup execution - redo `ModelEvaluation` + threadpool (workqueue still needs work) - make bash app template for all bash apps. It is now possible to specify where tmpdirs are created through the `tmpdir_root` config option. Also, you can specify whether tmpdirs should be removed after the tasks finish (for debugging purposes) ATM, `ModelTraining` and `ReferenceEvaluation` are moderately broken, most likely --- psiflow/__init__.py | 3 - psiflow/execution.py | 250 ++++++++++++++++++--------------- psiflow/functions.py | 1 + psiflow/hamiltonians.py | 4 +- psiflow/models/_mace.py | 2 +- psiflow/order_parameters.py | 2 - psiflow/reference/cp2k_.py | 9 +- psiflow/reference/gpaw_.py | 3 - psiflow/reference/orca_.py | 8 +- psiflow/reference/reference.py | 5 +- psiflow/sampling/ase.py | 19 +-- psiflow/sampling/optimize.py | 10 +- psiflow/sampling/sampling.py | 14 +- psiflow/sampling/server.py | 1 - psiflow/sampling/walker.py | 10 +- psiflow/serialization.py | 4 +- psiflow/utils/__init__.py | 8 -- 17 files changed, 172 insertions(+), 181 deletions(-) delete mode 100644 psiflow/order_parameters.py diff --git a/psiflow/__init__.py b/psiflow/__init__.py index e8de3e3..73f2e3f 100644 --- a/psiflow/__init__.py +++ b/psiflow/__init__.py @@ -1,7 +1,5 @@ from pathlib import Path -import typeguard - from .config import setup_slurm_config # noqa: F401 from .execution import ExecutionContextLoader from .serialization import ( # noqa: F401 @@ -12,7 +10,6 @@ ) -@typeguard.typechecked def resolve_and_check(path: Path) -> Path: path = path.resolve() if 
Path.cwd() in path.parents: diff --git a/psiflow/execution.py b/psiflow/execution.py index 79b3745..2a3e0bc 100644 --- a/psiflow/execution.py +++ b/psiflow/execution.py @@ -4,13 +4,14 @@ import sys import warnings import subprocess +import textwrap from datetime import datetime, timedelta from dataclasses import dataclass from pathlib import Path from threading import Lock # see https://stackoverflow.com/questions/59904631/python-class-constants-in-dataclasses -from typing import Any, Optional, Union, ClassVar, Protocol, Iterable +from typing import Any, Optional, Union, ClassVar, Protocol, Iterable, Sequence import parsl import psutil @@ -71,9 +72,9 @@ class ReferenceSpec(Protocol): """Defines default options for Reference implementations""" name: ClassVar[str] - reference_args: ClassVar[tuple[str, ...]] + reference_args: ClassVar[tuple[str, ...]] # TODO: update 'cores_per_worker' mpi_command: str - mpi_args: Iterable[str] + mpi_args: Sequence[str] executable: str def launch_command(self) -> str: @@ -90,7 +91,7 @@ class CP2KReferenceSpec(ReferenceSpec): name = "CP2K" reference_args = ("cores_per_worker",) mpi_command: str = "mpirun -np {cores_per_worker}" - mpi_args: tuple[str, ...] = ( + mpi_args: Sequence[str] = ( "-ENV OMP_NUM_THREADS=1", "--bind-to core", "--map-by core", @@ -106,7 +107,7 @@ class GPAWReferenceSpec(ReferenceSpec): name = "GPAW" reference_args = ("cores_per_worker",) mpi_command: str = "mpirun -np {cores_per_worker}" - mpi_args: tuple[str, ...] = ( + mpi_args: Sequence[str] = ( "-x OMP_NUM_THREADS=1", "--bind-to core", "--map-by core", @@ -122,7 +123,7 @@ class ORCAReferenceSpec(ReferenceSpec): name = "ORCA" reference_args = () mpi_command: str = "" - mpi_args: tuple[str, ...] 
= ( + mpi_args: Sequence[str] = ( "-x OMP_NUM_THREADS=1", "--bind-to core", "--map-by core", @@ -141,12 +142,6 @@ def launch_command(self): } -def str_to_timedelta(s: str) -> timedelta: - # TODO: move to utils - t = datetime.strptime(s, "%H:%M:%S") - return timedelta(hours=t.hour, minutes=t.minute, seconds=t.second) - - def make_slurm_provider(kwargs: dict) -> tuple[SlurmProvider, dict]: defaults = {"init_blocks": 0, "exclusive": False} required = ("cores_per_node", "walltime", "gpus_per_node") @@ -166,7 +161,7 @@ def make_slurm_provider(kwargs: dict) -> tuple[SlurmProvider, dict]: def make_local_provider(kwargs: dict) -> tuple[LocalProvider, dict]: resources = { "nodes": 1, - "cores": kwargs.get("cores", psutil.cpu_count()), + "cores": kwargs.get("cores", psutil.cpu_count(logical=False)), "memory": kwargs.get( "memory", psutil.virtual_memory().available / 1e9 ), # TODO: available? @@ -190,8 +185,31 @@ def make_local_provider(kwargs: dict) -> tuple[LocalProvider, dict]: return provider, resources +def make_default_executors( + max_workers: int, path: Path, container: ContainerSpec +) -> tuple[HighThroughputExecutor, ThreadPoolExecutor]: + """Construct executors for internal app handling""" + launcher = SimpleLauncher() + if container is not None: + launcher = WrappedLauncher(prepend=container.launch_command()) + + htex = HighThroughputExecutor( + label="default_htex", + working_dir=str(path / "default_htex"), + cores_per_worker=1, + max_workers_per_node=max_workers, + cpu_affinity="none", + provider=LocalProvider(launcher=launcher, init_blocks=0), + ) + threadpool = ThreadPoolExecutor( + label="default_threads", + max_threads=max_workers, + working_dir=str(path), + ) + return htex, threadpool + + class ExecutionDefinition: - # TODO: do not like defining some kwargs in class method and other kwargs in init... 
def __init__( self, provider: ExecutionProvider | None, @@ -257,6 +275,7 @@ def __init__( register_definition(definition=self) # TODO: how to handle env variables? + # TODO: check between min_runtime and max_runtime? pass @@ -271,7 +290,16 @@ def lifetime(self) -> float: @property def use_gpu(self) -> bool: - return self.kwargs.get("use_gpu") or self.kwargs.get("gpus_per_task", 0) > 0 + if self.executor_type == "threadpool": + return self.kwargs["use_gpu"] + return self.kwargs["gpus_per_task"] > 0 + + @property + def cores_per_task(self) -> int: + if self.executor_type == "workqueue": + return self.kwargs["cores_per_task"] + # assumes all threads are working + return int(self.resources["cores"] / self.kwargs["max_threads"]) def wrap_in_timeout(self, command: str) -> str: if self.max_runtime == float("inf"): @@ -313,8 +341,9 @@ def _create_workqueue(self, path: Path) -> WorkQueueExecutor: prepend = self.container.launch_command() worker_executable = f"{prepend} {worker_executable}" - # TODO: why the custom WQ? - executor = MyWorkQueueExecutor( + # TODO: why the custom WQ? -- does not seem necessary (anymore) + # executor = MyWorkQueueExecutor( + executor = WorkQueueExecutor( label=self.name, working_dir=str(path / self.name), provider=self.provider, @@ -340,7 +369,7 @@ def wq_resources(self, *args, **kwargs) -> dict: @classmethod def from_config( cls, - executor: str, + executor: str, # TODO: no default value? container: Optional[ContainerSpec], **kwargs, ): @@ -390,26 +419,24 @@ def from_config( class ModelEvaluation(ExecutionDefinition): def __init__( self, - timeout: float = 5, # TODO: units? 
+ timeout: float = 5.0, **kwargs, ): super().__init__(**kwargs) - self.timeout = timeout - # TODO: temporary - self.cores_per_worker = self.kwargs.get("cores_per_task", 1) - self.gpu = False - self.max_simulation_time = self.max_runtime - self.env_vars = {"OMP_NUM_THREADS": "1"} + self.timeout = timeout # i-Pi will kill client connections after no response for timeout seconds - # TODO: what with env vars? - # default_env_vars = { - # "OMP_NUM_THREADS": str(self.cores_per_worker), - # "KMP_AFFINITY": "granularity=fine,compact,1,0", - # "KMP_BLOCKTIME": "1", - # "OMP_PROC_BIND": "false", - # "PYTHONUNBUFFERED": "TRUE", - # } + if self.executor_type == "threadpool": + # disable thread affinity and busy-idling + env_vars = { + "OMP_PROC_BIND": "FALSE", + "OMP_WAIT_POLICY": "PASSIVE", + "OMP_NUM_THREADS": f"{self.cores_per_task}", + # "OMP_DISPLAY_ENV": "VERBOSE", # verbose OMP log + } + else: + assert False, "IMPLEMENT THIS" + self.env_vars = env_vars | self.env_vars def server_command(self) -> str: command = "psiflow-server" @@ -420,7 +447,7 @@ def get_driver_devices(self, nwalkers: int) -> list[dict]: # TODO: what if only 1 gpu is available? Redo this # nclients = min(nwalkers, self.max_workers) nclients = min(nwalkers, 2) - if self.gpu: + if self.use_gpu: return [{"device": f"cuda:{i}"} for i in range(nclients)] else: return [{"device": "cpu"} for _ in range(nclients)] @@ -428,7 +455,7 @@ def get_driver_devices(self, nwalkers: int) -> list[dict]: def wq_resources(self, nwalkers: int) -> dict: if self.spec is None: return {} # threadpool - + # TODO: reimplement this return self.spec # def wq_resources(self, nwalkers): @@ -464,6 +491,11 @@ def __init__( ) assert self.max_runtime is None, message + if not self.use_gpu: + warnings.warn( + "ModelTraining is configured for CPU operation. Is this what you want?" 
+ ) + # default_env_vars = { # "OMP_NUM_THREADS": str(self.cores_per_worker), # "KMP_AFFINITY": "granularity=fine,compact,1,0", @@ -484,7 +516,7 @@ def train_command(self, initialize: bool = False): def wq_resources(self, *args, **kwargs) -> dict: if self.spec is None: return {} # threadpool - + # TODO: reimplement this return self.spec # def wq_resources(self): @@ -511,13 +543,14 @@ def wq_resources(self, *args, **kwargs) -> dict: class ReferenceEvaluation(ExecutionDefinition): def __init__( self, - spec: "ReferenceSpec", + reference: "ReferenceSpec", memory_limit: Optional[str] = None, # TODO: how does this work? **kwargs, ) -> None: # TODO: how to know which code? + # before super().__init__ because 'name' attribute needed + self.reference = reference super().__init__(**kwargs) - self.spec = spec self.memory_limit = memory_limit def command(self): @@ -548,12 +581,14 @@ def parse_size(size): # TODO: to utils? # exit code 0 so parsl always thinks bash app succeeded return "\n".join([*commands, launch_command, "exit 0"]) - def _modify_wq_resources(self, spec: dict, *args, **kwargs) -> dict: - return spec + def wq_resources(self, *args, **kwargs) -> dict: + if self.spec is None: + return {} # threadpool + return self.spec @property def name(self) -> str: - return self.spec.name + return self.reference.name class ExecutionContext: @@ -574,12 +609,21 @@ def __init__( config: Config, definitions: list[ExecutionDefinition], path: Union[Path, str], + tmpdir_root: str, + keep_tmpdirs: bool, + **kwargs, ) -> None: self.config = config self.path = Path(path).resolve() self.path.mkdir(parents=True, exist_ok=True) + self.definitions = {d.name: d for d in definitions} assert len(self.definitions) == len(definitions) + + # make sure task tmpdirs can be made + Path(tmpdir_root).mkdir(parents=True, exist_ok=True) + self.bash_template = create_bash_template(tmpdir_root, keep_tmpdirs) + self.file_index = {} self.lock = Lock() parsl.load(config) @@ -608,16 +652,8 @@ def 
new_file(self, prefix: str, suffix: str) -> File: @classmethod def from_config( cls, - parsl_log_level: str = "WARNING", - usage_tracking: int = 3, - retries: int = 2, - strategy: str = "simple", - max_idletime: float = 20, - internal_tasks_max_threads: int = 10, - default_threads: int = 4, - # htex_address: str = "127.0.0.1", - zip_staging: Optional[bool] = None, - make_symlinks: bool = False, + parsl_log_level: str, + default_threads: int, **kwargs, ) -> "ExecutionContext": path = Path.cwd().resolve() / PSIFLOW_INTERNAL @@ -633,28 +669,18 @@ def from_config( # create definitions base_container = ContainerSpec.from_kwargs(kwargs) model_evaluation = ModelEvaluation.from_config( - container=base_container, - **kwargs.pop("ModelEvaluation", {}), + container=base_container, **kwargs["ModelEvaluation"] ) model_training = ModelTraining.from_config( - container=base_container, - **kwargs.pop( - "ModelTraining", {"gpu": True} - ), # avoid triggering assertion TODO: change into warning + container=base_container, **kwargs["ModelTraining"] ) - # TODO: remove this and check below - model_evaluation.wq_resources(0) - model_evaluation.server_command() - model_training.wq_resources() - reference_evaluations = [] # reference evaluations might be class specific for key in list(kwargs.keys()): if key[:4] in REFERENCE_SPECS: # allow for e.g., CP2K_small - config = kwargs.pop(key) + config = kwargs[key] reference_evaluation = ReferenceEvaluation.from_config( - # spec=init_spec(REFERENCE_SPECS[key[:4]], config), - spec=REFERENCE_SPECS[key[:4]].from_kwargs(**config), + reference=REFERENCE_SPECS[key[:4]].from_kwargs(**config), container=ContainerSpec.from_kwargs(kwargs | config), **config, ) @@ -663,53 +689,13 @@ def from_config( # create main parsl executors executors = [d.create_executor(path=path) for d in definitions] + internal = make_default_executors(default_threads, path, base_container) + executors.extend(internal) - # create default executors - # TODO: extract this into 
function - if base_container is not None: - launcher = WrappedLauncher(prepend=base_container.launch_command()) - else: - launcher = SimpleLauncher() - htex = HighThroughputExecutor( - label="default_htex", - # address=htex_address, - working_dir=str(path / "default_htex"), - cores_per_worker=1, - max_workers_per_node=default_threads, - cpu_affinity="none", - provider=LocalProvider(launcher=launcher, init_blocks=0), # noqa: F405 - ) - threadpool = ThreadPoolExecutor( - label="default_threads", - max_threads=default_threads, - working_dir=str(path), - ) - executors.extend([htex, threadpool]) - - # remove additional kwargs - # if zip_staging: - - # def zip_uri(base, task_record, err_or_out): - # zip_path = base / "base.zip" - # file = f"{task_record['func_name']}.{task_record['id']}.{task_record['try_id']}.{err_or_out}" - # return File(f"zip:{zip_path}/{file}") - - # std_autopath = partial(zip_uri, path) - # else: - # std_autopath = None config = Config( - executors=executors, - run_dir=str(path), - initialize_logging=False, - # app_cache=False, - usage_tracking=usage_tracking, - retries=retries, - strategy=strategy, - max_idletime=max_idletime, - internal_tasks_max_threads=internal_tasks_max_threads, - # std_autopath=std_autopath, + executors=executors, run_dir=str(path), initialize_logging=False ) - return ExecutionContext(config, definitions, path / "context_dir") + return ExecutionContext(config, definitions, path / "context_dir", **kwargs) class ExecutionContextLoader: @@ -789,12 +775,14 @@ def __call__(self, command: str, tasks_per_node: int, nodes_per_block: int) -> s return x -class MyWorkQueueExecutor(WorkQueueExecutor): - # TODO: what does this do? - def _get_launch_command(self, block_id): - return self.worker_command +# class MyWorkQueueExecutor(WorkQueueExecutor): +# # TODO: what does this do? 
+# def _get_launch_command(self, block_id): +# return self.worker_command +# TODO: move everything below to appropriate files + # TODO: attempt at managing priority through global state WQ_RESOURCES_REGISTRY = {} @@ -838,6 +826,11 @@ def __exit__(self, exc_type, exc_val, exc_tb): DEFAULT_CONFIG = """ parsl_log_level: WARNING usage_tracking: 3 +default_threads: 4 +max_idletime: 20 +tmpdir_root: /tmp +keep_tmpdirs: false +gpu_flavour: nvidia ModelEvaluation: executor: threadpool @@ -856,3 +849,34 @@ def patch_parsl_dirtree() -> None: # replace with noop, which needs to happen after parsl.dataflow.dflow initialises parsl.dataflow.dflow.make_rundir = lambda x: x + + +# TODO: arguments that need documenting: retries, strategy?, timeout, garbage_collect (Config) + + +def create_bash_template(tmpdir_root: str, keep_tmpdirs: bool) -> str: + """Create general wrapper for all bash apps. The exitcode ensures that every app completes successfully.""" + template = f""" + # Create and move into new tmpdir for app execution + tmpdir=$(mktemp -d -p {tmpdir_root} "psiflow-tmp.XXXXXXXXXX") + cd $tmpdir; echo "tmpdir: $PWD" + export {{env}} + printenv + + # Actual app definition goes here + {{commands}} + + # Cleanup + {'cd ../.. && rm -r $tmpdir' if not keep_tmpdirs else ''} + exit 0 + """ + return textwrap.dedent(template) + + +def format_env_vars(env_vars: dict) -> str: + return " ".join([f"{k}={v}" for k, v in env_vars.items()]) + + +def str_to_timedelta(s: str) -> timedelta: + t = datetime.strptime(s, "%H:%M:%S") + return timedelta(hours=t.hour, minutes=t.minute, seconds=t.second) \ No newline at end of file diff --git a/psiflow/functions.py b/psiflow/functions.py index e12bec9..8689616 100644 --- a/psiflow/functions.py +++ b/psiflow/functions.py @@ -296,6 +296,7 @@ class DispersionFunction(EnergyFunction): def __post_init__(self): # OMP_NUM_THREADS for parallel evaluation does not work.. 
# https://github.com/dftd3/simple-dftd3/issues/49 + # TODO: check whether this is still the case os.environ["OMP_NUM_THREADS"] = str(self.num_threads * 10) from dftd3.ase import DFTD3 diff --git a/psiflow/hamiltonians.py b/psiflow/hamiltonians.py index a065484..59ca43e 100644 --- a/psiflow/hamiltonians.py +++ b/psiflow/hamiltonians.py @@ -444,9 +444,9 @@ def parameters(self) -> dict: return { "model_path": model_path, "atomic_energies": self.atomic_energies, - "ncores": evaluation.cores_per_worker, + "ncores": evaluation.cores_per_task, "dtype": "float32", - "device": "gpu" if evaluation.gpu else "cpu", + "device": "gpu" if evaluation.use_gpu else "cpu", "env_vars": evaluation.env_vars, } diff --git a/psiflow/models/_mace.py b/psiflow/models/_mace.py index 5ed6772..46fc04d 100644 --- a/psiflow/models/_mace.py +++ b/psiflow/models/_mace.py @@ -227,7 +227,7 @@ def _create_apps(self): # initialize apps app_initialize = bash_app(initialize, executors=[evaluation.name]) resources_init = evaluation.wq_resources(1) # TODO: find a better way for model init - if not evaluation.use_threadpool: + if not evaluation.executor_type == "threadpool": resources_init["running_time_min"] = 30 # at least 30 mins for init? 
app_train = bash_app(train, executors=[training.name]) resources_train = training.wq_resources() diff --git a/psiflow/order_parameters.py b/psiflow/order_parameters.py deleted file mode 100644 index 204de2e..0000000 --- a/psiflow/order_parameters.py +++ /dev/null @@ -1,2 +0,0 @@ -class OrderParameter: - pass diff --git a/psiflow/reference/cp2k_.py b/psiflow/reference/cp2k_.py index 6af6095..eb30e73 100644 --- a/psiflow/reference/cp2k_.py +++ b/psiflow/reference/cp2k_.py @@ -15,8 +15,6 @@ from psiflow.geometry import Geometry from psiflow.reference.reference import Reference, Status, get_spin_multiplicities from psiflow.utils.parse import find_line, lines_to_array -from psiflow.utils import TMP_COMMAND, CD_COMMAND - # costly to initialise input_parser = CP2KInputParserSimplified( @@ -141,12 +139,7 @@ def get_single_atom_references(self, element: str) -> dict[int, Reference]: return references def get_shell_command(self, inputs: list[File]) -> str: - command_list = [ - TMP_COMMAND, - CD_COMMAND, - f"cp {inputs[0].filepath} cp2k.inp", - self.execute_command, - ] + command_list = [f"cp {inputs[0].filepath} cp2k.inp", self.execute_command] return "\n".join(command_list) def parse_output(self, stdout: str) -> dict: diff --git a/psiflow/reference/gpaw_.py b/psiflow/reference/gpaw_.py index 5c6d5bd..a119271 100644 --- a/psiflow/reference/gpaw_.py +++ b/psiflow/reference/gpaw_.py @@ -8,7 +8,6 @@ import psiflow from psiflow.geometry import Geometry from psiflow.reference.reference import Reference, Status -from psiflow.utils import TMP_COMMAND, CD_COMMAND from psiflow.utils.apps import copy_app_future from psiflow.utils.parse import find_line from psiflow.reference._gpaw import FILEPATH, DEFAULTS, STDOUT_KEY @@ -58,8 +57,6 @@ def compute_atomic_energy(self, element, box_size=None) -> AppFuture: def get_shell_command(self, inputs: list[File]) -> str: command_list = [ - TMP_COMMAND, - CD_COMMAND, f"cp {inputs[0].filepath} input.json", f"cp {self.script} script_gpaw.py", 
self.execute_command, diff --git a/psiflow/reference/orca_.py b/psiflow/reference/orca_.py index 05cd2d1..ca14115 100644 --- a/psiflow/reference/orca_.py +++ b/psiflow/reference/orca_.py @@ -12,7 +12,6 @@ import psiflow from psiflow.geometry import Geometry from psiflow.reference.reference import Reference, Status, get_spin_multiplicities -from psiflow.utils import TMP_COMMAND, CD_COMMAND from psiflow.utils.parse import find_line, lines_to_array, string_to_timedelta @@ -162,12 +161,7 @@ def get_single_atom_references(self, element: str) -> dict[int, Reference]: return references def get_shell_command(self, inputs: list[File]) -> str: - command_list = [ - TMP_COMMAND, - CD_COMMAND, - f"cp {inputs[0].filepath} orca.inp", - self.execute_command, - ] + command_list = [f"cp {inputs[0].filepath} orca.inp", self.execute_command] return "\n".join(command_list) def parse_output(self, stdout: str) -> dict: diff --git a/psiflow/reference/reference.py b/psiflow/reference/reference.py index 96ff842..6a8c008 100644 --- a/psiflow/reference/reference.py +++ b/psiflow/reference/reference.py @@ -98,7 +98,10 @@ def _execute( stderr: str = parsl.AUTO_LOGNAME, label: str = "singlepoint", ) -> str: - return reference.get_shell_command(inputs) + # TODO: we do not set env_vars here? 
+ command = reference.get_shell_command(inputs) + template = psiflow.context().bash_template + return template.format(commands=command, env='>/dev/null') def _process_output( diff --git a/psiflow/sampling/ase.py b/psiflow/sampling/ase.py index 87434fe..cd341fb 100644 --- a/psiflow/sampling/ase.py +++ b/psiflow/sampling/ase.py @@ -11,12 +11,12 @@ from psiflow.hamiltonians import Hamiltonian from psiflow.utils.apps import setup_logger from psiflow.utils.io import _dump_json -from psiflow.utils import TMP_COMMAND, CD_COMMAND, export_env_command from psiflow.utils.parse import get_task_name_id +from psiflow.execution import format_env_vars from ._ase import ALLOWED_MODES, __file__ as file_ase -DEFAULT_EXECUTABLE = 'script.py' +DEFAULT_EXECUTABLE = "script.py" logger = setup_logger(__name__) # logging per module @@ -63,14 +63,13 @@ def _execute_ase( command_opt_args.append(f"--output_traj={outputs[1].filepath}") command_list = [ - TMP_COMMAND, - CD_COMMAND, - export_env_command(env_vars), f"cp {inputs[0].filepath} {DEFAULT_EXECUTABLE}", " ".join(command_opt_args), "exit 0", # ignore timeout exitcode ] - return "\n".join(command_list) + template = psiflow.context().bash_template + commands, env = "\n".join(command_list), format_env_vars(env_vars) + return template.format(commands=commands, env=env) execute_ase = bash_app(_execute_ase, executors=["ModelEvaluation"]) @@ -93,12 +92,8 @@ def optimize( context = psiflow.context() definition = context.definitions["ModelEvaluation"] - - command_list = [f"python -u {DEFAULT_EXECUTABLE}"] - if definition.max_simulation_time is not None: - max_time = 0.9 * (60 * definition.max_simulation_time) - command_list = ["timeout -s 15 {}s".format(max_time), *command_list] - command_launch = " ".join(command_list) + command = f"python -u {DEFAULT_EXECUTABLE}" + command_launch = definition.wrap_in_timeout(command) input_geometry = Dataset([state]).extxyz # state can be future hamiltonian = 1.0 * hamiltonian # convert to mixture diff --git 
a/psiflow/sampling/optimize.py b/psiflow/sampling/optimize.py index 74d7d54..b92f423 100644 --- a/psiflow/sampling/optimize.py +++ b/psiflow/sampling/optimize.py @@ -20,7 +20,7 @@ ) from psiflow.sampling.output import HamiltonianComponent from psiflow.utils.io import save_xml -from psiflow.utils import TMP_COMMAND, CD_COMMAND, export_env_command +from psiflow.execution import format_env_vars warnings.warn( @@ -111,17 +111,15 @@ def _execute_ipi( set(d["address"] for d in driver_kwargs) ) commands_driver = make_driver_commands(driver_kwargs, file_xyz_in, files_in) - command_list = [ - TMP_COMMAND, - CD_COMMAND, - export_env_command(env_vars), command_start, command_wait, *commands_driver, "wait", ] - return "\n".join(command_list) + template = psiflow.context().bash_template + commands, env = "\n".join(command_list), format_env_vars(env_vars) + return template.format(commands=commands, env=env) execute_ipi = bash_app(_execute_ipi, executors=["ModelEvaluation"]) diff --git a/psiflow/sampling/sampling.py b/psiflow/sampling/sampling.py index 4328ea3..25d0d52 100644 --- a/psiflow/sampling/sampling.py +++ b/psiflow/sampling/sampling.py @@ -19,10 +19,10 @@ potential_component_name, HamiltonianComponent, ) +from psiflow.execution import format_env_vars from psiflow.sampling.utils import create_xml_list from psiflow.sampling.walker import Coupling, Walker, partition, Ensemble from psiflow.utils.io import _save_xml -from psiflow.utils import TMP_COMMAND, CD_COMMAND, export_env_command from psiflow.sampling.driver import __file__ as PATH_DRIVER @@ -113,7 +113,7 @@ def setup_sockets(components: list[HamiltonianComponent]) -> list[ET.Element]:
{address}
""" - timeout = 60 * psiflow.context().definitions["ModelEvaluation"].timeout + timeout = psiflow.context().definitions["ModelEvaluation"].timeout sockets = [] for comp in components: @@ -450,10 +450,7 @@ def _execute_ipi( commands_driver = make_driver_commands(driver_kwargs, file_xyz_in, files_in) command_list = [ - TMP_COMMAND, - CD_COMMAND, - "\n".join(write_command_args), - export_env_command(env_vars), + *write_command_args, command_start, command_wait, *commands_driver, @@ -461,7 +458,10 @@ def _execute_ipi( ] if coupling_command: command_list.append(coupling_command) - return "\n".join(command_list) + + template = psiflow.context().bash_template + commands, env = "\n".join(command_list), format_env_vars(env_vars) + return template.format(commands=commands, env=env) execute_ipi = bash_app(_execute_ipi, executors=["ModelEvaluation"]) diff --git a/psiflow/sampling/server.py b/psiflow/sampling/server.py index 2fe3c71..6ade9fe 100644 --- a/psiflow/sampling/server.py +++ b/psiflow/sampling/server.py @@ -97,7 +97,6 @@ def run(start_xyz: str, input_xml: str): # prepare starting geometries from context_dir data_start: list[ase.Atoms] = read(start_xyz, index=":") for i, at in enumerate(data_start): - print(at.pbc) if not any(at.pbc): # set fake large cell for i-PI at.pbc = True at.cell = Cell(NONPERIODIC_CELL) diff --git a/psiflow/sampling/walker.py b/psiflow/sampling/walker.py index 0b2cdec..9b2ce9c 100644 --- a/psiflow/sampling/walker.py +++ b/psiflow/sampling/walker.py @@ -14,7 +14,7 @@ from psiflow.data import Dataset from psiflow.geometry import Geometry, check_equality from psiflow.hamiltonians import Hamiltonian, Zero, combine_hamiltonians -from psiflow.order_parameters import OrderParameter +# from psiflow.order_parameters import OrderParameter from psiflow.sampling.metadynamics import Metadynamics from psiflow.utils.apps import copy_app_future @@ -78,7 +78,7 @@ class Walker: masses: Union[np.ndarray, float, None] = None nbeads: int = 1 metadynamics: 
Optional[Metadynamics] = None - order_parameter: Optional[OrderParameter] = None + # order_parameter: Optional['OrderParameter'] = None state: Union[Geometry, AppFuture] = field(init=False) coupling: Optional[Coupling] = field(init=False) @@ -96,9 +96,9 @@ def __post_init__(self): # we cannot check this for futures assert self.pressure is None, "Pressure requires PBC" - if self.order_parameter is not None: - # TODO: order_parameter out of commission - self.start = self.order_parameter.evaluate(self.start) + # if self.order_parameter is not None: + # # TODO: order_parameter out of commission + # self.start = self.order_parameter.evaluate(self.start) if (m := self.masses) is None: pass # do nothing diff --git a/psiflow/serialization.py b/psiflow/serialization.py index c37eea6..6e49038 100644 --- a/psiflow/serialization.py +++ b/psiflow/serialization.py @@ -279,7 +279,7 @@ def deserialize(data_str: str, custom_cls: Optional[list] = None): from psiflow.learning import Learning from psiflow.metrics import Metrics from psiflow.models import MACE - from psiflow.order_parameters import OrderParameter + # from psiflow.order_parameters import OrderParameter from psiflow.reference import CP2K, GPAW, ORCA, ReferenceDummy from psiflow.sampling import Metadynamics, ReplicaExchange, SimulationOutput, Walker @@ -300,7 +300,7 @@ def deserialize(data_str: str, custom_cls: Optional[list] = None): Harmonic, MixtureHamiltonian, Metadynamics, - OrderParameter, + # OrderParameter, ReplicaExchange, SimulationOutput, Walker, diff --git a/psiflow/utils/__init__.py b/psiflow/utils/__init__.py index e103e43..e69de29 100644 --- a/psiflow/utils/__init__.py +++ b/psiflow/utils/__init__.py @@ -1,8 +0,0 @@ -TMP_COMMAND = 'tmpdir=$(mktemp -d -p /tmp "mytmpdir.XXXXXXXXXX" || mktemp -d -t "mytmpdir.XXXXXXXXXX")' -CD_COMMAND = 'cd $tmpdir; echo "tmpdir: $PWD"' - - -def export_env_command(env_vars: dict) -> str: - return "export " + " ".join( - [f"{name}={value}" for name, value in env_vars.items()] - 
) From c2d0e459dfbb16225a25b1bf3571cab107ac51ce Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Tue, 3 Mar 2026 16:34:05 +0100 Subject: [PATCH 06/15] bugfix bash apps do not know about global scope variables (unless in threadpool) --- psiflow/free_energy/phonons.py | 13 ++++++------- psiflow/sampling/ase.py | 6 +++--- psiflow/sampling/optimize.py | 5 +++-- psiflow/sampling/sampling.py | 5 +++-- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/psiflow/free_energy/phonons.py b/psiflow/free_energy/phonons.py index c93dd47..2c2513c 100644 --- a/psiflow/free_energy/phonons.py +++ b/psiflow/free_energy/phonons.py @@ -17,10 +17,10 @@ make_driver_commands, make_wait_for_sockets_command, ) -from psiflow.sampling.optimize import setup_forces, export_env_command +from psiflow.sampling.optimize import setup_forces from psiflow.utils.apps import multiply from psiflow.utils.io import load_numpy, save_xml -from psiflow.utils import TMP_COMMAND, CD_COMMAND +from psiflow.execution import format_env_vars def _compute_frequencies(hessian: np.ndarray, geometry: Geometry) -> np.ndarray: @@ -88,6 +88,7 @@ def _execute_ipi( driver_kwargs: list[dict], command_server: str, env_vars: dict = {}, + bash_template: str = "", stdout: str = parsl.AUTO_LOGNAME, stderr: str = parsl.AUTO_LOGNAME, inputs: list = [], @@ -102,18 +103,15 @@ def _execute_ipi( set(d["address"] for d in driver_kwargs) ) commands_driver = make_driver_commands(driver_kwargs, file_xyz_in, files_in) - command_list = [ - TMP_COMMAND, - CD_COMMAND, - export_env_command(env_vars), command_start, command_wait, *commands_driver, "wait", f"cp i-pi.output_full.hess {outputs[0]}", ] - return "\n".join(command_list) + commands, env = "\n".join(command_list), format_env_vars(env_vars) + return bash_template.format(commands=commands, env=env) execute_ipi = bash_app(_execute_ipi, executors=["ModelEvaluation"]) @@ -171,6 +169,7 @@ def compute_harmonic( driver_kwargs, definition.server_command(), 
env_vars=definition.env_vars, + bash_template=context.bash_template, inputs=inputs, outputs=[context.new_file("hess_", ".txt")], parsl_resource_specification=definition.wq_resources(1), diff --git a/psiflow/sampling/ase.py b/psiflow/sampling/ase.py index cd341fb..e039791 100644 --- a/psiflow/sampling/ase.py +++ b/psiflow/sampling/ase.py @@ -48,6 +48,7 @@ def _execute_ase( inputs: list[DataFuture], outputs: list[DataFuture], env_vars: dict = {}, + bash_template: str = "", stdout: str = parsl.AUTO_LOGNAME, stderr: str = parsl.AUTO_LOGNAME, parsl_resource_specification: Optional[dict] = None, @@ -65,11 +66,9 @@ def _execute_ase( command_list = [ f"cp {inputs[0].filepath} {DEFAULT_EXECUTABLE}", " ".join(command_opt_args), - "exit 0", # ignore timeout exitcode ] - template = psiflow.context().bash_template commands, env = "\n".join(command_list), format_env_vars(env_vars) - return template.format(commands=commands, env=env) + return bash_template.format(commands=commands, env=env) execute_ase = bash_app(_execute_ase, executors=["ModelEvaluation"]) @@ -123,6 +122,7 @@ def optimize( result = execute_ase( command_launch=command_launch, env_vars=definition.env_vars, + bash_template=context.bash_template, inputs=inputs, outputs=outputs, parsl_resource_specification=definition.wq_resources(1), diff --git a/psiflow/sampling/optimize.py b/psiflow/sampling/optimize.py index b92f423..c8cd68c 100644 --- a/psiflow/sampling/optimize.py +++ b/psiflow/sampling/optimize.py @@ -97,6 +97,7 @@ def _execute_ipi( driver_kwargs: list[dict], command_server: str, env_vars: dict = {}, + bash_template: str = "", stdout: str = parsl.AUTO_LOGNAME, stderr: str = parsl.AUTO_LOGNAME, inputs: list = [], @@ -117,9 +118,8 @@ def _execute_ipi( *commands_driver, "wait", ] - template = psiflow.context().bash_template commands, env = "\n".join(command_list), format_env_vars(env_vars) - return template.format(commands=commands, env=env) + return bash_template.format(commands=commands, env=env) execute_ipi = 
bash_app(_execute_ipi, executors=["ModelEvaluation"]) @@ -186,6 +186,7 @@ def optimize( driver_kwargs, definition.server_command(), env_vars=definition.env_vars, + bash_template=context.bash_template, inputs=inputs, outputs=outputs, parsl_resource_specification=definition.wq_resources(1), diff --git a/psiflow/sampling/sampling.py b/psiflow/sampling/sampling.py index 25d0d52..3007c37 100644 --- a/psiflow/sampling/sampling.py +++ b/psiflow/sampling/sampling.py @@ -424,6 +424,7 @@ def _execute_ipi( command_server: str, *plumed_list: str, env_vars: dict = {}, + bash_template: str = "", stdout: str = parsl.AUTO_LOGNAME, stderr: str = parsl.AUTO_LOGNAME, inputs: list = [], @@ -459,9 +460,8 @@ def _execute_ipi( if coupling_command: command_list.append(coupling_command) - template = psiflow.context().bash_template commands, env = "\n".join(command_list), format_env_vars(env_vars) - return template.format(commands=commands, env=env) + return bash_template.format(commands=commands, env=env) execute_ipi = bash_app(_execute_ipi, executors=["ModelEvaluation"]) @@ -599,6 +599,7 @@ def _sample( definition.server_command(), *plumed_list, # futures env_vars=dict(definition.env_vars), + bash_template=context.bash_template, inputs=inputs, outputs=outputs, parsl_resource_specification=definition.wq_resources(max_nclients), From a0479fdb195a06a51da1053afd25511c54f9a8ec Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Wed, 4 Mar 2026 22:45:23 +0100 Subject: [PATCH 07/15] overhaul ModelEvaluation (Re)implementing some logic - to dynamically scale up MD resources depending on walkers/hamiltonians (capped by the 'max_resource_multiplier' option) - to decide how many clients to spawn for an MD run to avoid resource oversubscription ('allow_oversubscription' options) Also 'log_dfk_tasks' for debugging figure out how many clients can be used in a simulation --- psiflow/execution.py | 188 +++++++++++++++++++++++++---------- psiflow/sampling/sampling.py | 63 ++++++++---- 2 files changed, 179
insertions(+), 72 deletions(-) diff --git a/psiflow/execution.py b/psiflow/execution.py index 2a3e0bc..854f020 100644 --- a/psiflow/execution.py +++ b/psiflow/execution.py @@ -41,9 +41,9 @@ class ContainerSpec: """Controls container configuration""" uri: str - engine: str = "apptainer" + engine: str addopts: str = " --no-eval -e --no-mount home -W /tmp --writable-tmpfs" - gpu_flavour: str | None = None # TODO: add yaml argument + gpu_flavour: str | None = None def __post_init__(self): assert self.engine in ("apptainer", "singularity") @@ -63,9 +63,9 @@ def launch_command(self) -> str: def from_kwargs(kwargs: dict) -> Optional["ContainerSpec"]: if "container_uri" not in kwargs: return None - keys = ("container_uri", "container_engine", "container_addopts") - args = [kwargs[key] for key in keys if key in kwargs] - return ContainerSpec(*args) + keys = ("uri", "engine", "addopts", "gpu_flavour") + kwargs = {k: kwargs[k2] for k in keys if (k2 := f"container_{k}") in kwargs} + return ContainerSpec(**kwargs) class ReferenceSpec(Protocol): @@ -226,7 +226,6 @@ def __init__( self.kwargs = executor_kwargs self.resources = resources # compute per node self.container = container - self.env_vars = env_vars or {} if self.use_gpu: msg = "" @@ -253,7 +252,7 @@ def __init__( # how long can individual tasks run (in seconds) if max_runtime is None: - # allow some margin for task cleanup TODO: pretty random + # allow some margin for task cleanup max_runtime = max(0.9 * self.lifetime, self.lifetime - 60) else: max_runtime = str_to_timedelta(max_runtime).seconds @@ -262,8 +261,8 @@ def __init__( warnings.warn(msg) self.max_runtime = max_runtime - # set default WQ resource specs TODO: type_hint - self.spec = None + # set default WQ resource specs + self.spec: dict | None = None if self.executor_type == "workqueue": self.spec = { "cores": self.kwargs["cores_per_task"], @@ -274,7 +273,21 @@ def __init__( } register_definition(definition=self) + # handle task environment variables # TODO: 
how to handle env variables? + if self.executor_type == "threadpool": + # disable thread affinity and busy-idling + default_env_vars = { + "OMP_PROC_BIND": "FALSE", + "OMP_WAIT_POLICY": "PASSIVE", + "OMP_NUM_THREADS": f"{self.cores_per_task}", + # "OMP_DISPLAY_ENV": "VERBOSE", # verbose OMP log + } + else: + # assert False, "IMPLEMENT THIS" + default_env_vars = {} + self.env_vars = default_env_vars | (env_vars or {}) + # TODO: check between min_runtime and max_runtime? pass @@ -301,6 +314,19 @@ def cores_per_task(self) -> int: # assumes all threads are working return int(self.resources["cores"] / self.kwargs["max_threads"]) + @property + def task_slots(self) -> int: + if self.executor_type == "threadpool": + return self.kwargs["max_threads"] + + slots = self.resources["cores"] // self.cores_per_task + gpu_slots, memory_slots = float("inf"), float("inf") + if self.use_gpu: + gpu_slots = self.resources["gpus"] // self.kwargs["gpus_per_task"] + if (mem_per_task := self.kwargs["mem_per_task"]) > 0: + memory_slots = self.resources["memory"] // mem_per_task + return min(slots, gpu_slots, memory_slots) + def wrap_in_timeout(self, command: str) -> str: if self.max_runtime == float("inf"): return command # noop @@ -337,19 +363,18 @@ def _create_workqueue(self, path: Path) -> WorkQueueExecutor: worker_executable = "work_queue_worker" if not isinstance(self, ReferenceEvaluation) and self.container: # ModelEvaluation / ModelTraining run in container themselves - # Reference instances launch tasks in container + # Reference launches tasks in container prepend = self.container.launch_command() worker_executable = f"{prepend} {worker_executable}" # TODO: why the custom WQ? 
-- does not seem necessary (anymore) - # executor = MyWorkQueueExecutor( executor = WorkQueueExecutor( label=self.name, working_dir=str(path / self.name), provider=self.provider, shared_fs=True, # autocategory=False, - # port=0, + port=0, # avoid multiple executors trying to use the same port # max_retries=1, # coprocess=False, worker_options=" ".join(worker_options), @@ -420,57 +445,79 @@ class ModelEvaluation(ExecutionDefinition): def __init__( self, timeout: float = 5.0, + max_resource_multiplier: int | None = None, + allow_oversubscription: bool = True, **kwargs, ): super().__init__(**kwargs) - self.timeout = timeout # i-Pi will kill client connections after no response for timeout seconds + if self.use_gpu and self.kwargs['gpus_per_task'] > 1: + # TODO: 'ConfigurationError' maybe? + raise ValueError("No Hamiltonian can do multi-GPU evaluation") - if self.executor_type == "threadpool": - # disable thread affinity and busy-idling - env_vars = { - "OMP_PROC_BIND": "FALSE", - "OMP_WAIT_POLICY": "PASSIVE", - "OMP_NUM_THREADS": f"{self.cores_per_task}", - # "OMP_DISPLAY_ENV": "VERBOSE", # verbose OMP log - } - else: - assert False, "IMPLEMENT THIS" - self.env_vars = env_vars | self.env_vars + # i-Pi will kill client connections after no response for timeout seconds + self.timeout = timeout + + # allow MD tasks to consume more computational resources based on walkers and hamiltonians + # but never more than available in a single resource block + if max_resource_multiplier is None: + max_resource_multiplier = self.task_slots + elif max_resource_multiplier > self.task_slots: + warnings.warn( + "Provided 'max_resource_multiplier' exceeds available task slots " + f"({max_resource_multiplier} -> {self.task_slots}). " + f"Limiting 'max_resource_multiplier'." 
+ ) + max_resource_multiplier = self.task_slots + self.max_resource_multiplier = max_resource_multiplier + + # whether i-Pi clients are allowed to share cores/GPUs + self.allow_oversubscription = allow_oversubscription def server_command(self) -> str: command = "psiflow-server" return self.wrap_in_timeout(command) - def get_driver_devices(self, nwalkers: int) -> list[dict]: - # assumes driver is GPU capable - # TODO: what if only 1 gpu is available? Redo this - # nclients = min(nwalkers, self.max_workers) - nclients = min(nwalkers, 2) - if self.use_gpu: - return [{"device": f"cuda:{i}"} for i in range(nclients)] - else: - return [{"device": "cpu"} for _ in range(nclients)] + def get_driver_resources(self, n_walkers: int, n_drivers: int) -> list[dict]: + """Divide 'expensive' drivers over available resources.""" + n_clients = n_walkers * n_drivers + m = self.max_resource_multiplier + + if n_drivers > m and not self.allow_oversubscription: + # the combination of drivers does not fit on available resources + raise ValueError( + f"Simulation with {n_drivers} independent drivers not possible. " + f"Either increase 'max_resource_multiplier' or enable resource oversubscription." + ) + if n_clients > m and self.allow_oversubscription: + warnings.warn( + f"Simulation wants to employ {n_clients} clients, " + f"but can only use {m}x the per-client budget. " + f"Oversubscribing CPU/GPU resources." + ) + elif n_clients > m and not self.allow_oversubscription: + # limit total numer of clients so they do not fight over resources + n_clients = m - def wq_resources(self, nwalkers: int) -> dict: + # TODO: what if (n_clients % n_drivers != 0) + # you will have more copies of some drivers and fewer of others.. 
+ # TODO: what if (n_clients % m != 0) + # you will have more clients on some GPUs than others + + if not self.use_gpu: + return [{"device": "cpu"} for _ in range(n_clients)] + return [{"device": f"cuda:{_ % m}"} for _ in range(n_clients)] + + def wq_resources(self, n_clients: int) -> dict: if self.spec is None: return {} # threadpool - # TODO: reimplement this - return self.spec - # def wq_resources(self, nwalkers): - # if self.use_threadpool: - # return {} - # nclients = min(nwalkers, self.max_workers) - # resource_specification = {} - # resource_specification["cores"] = nclients * self.cores_per_worker - # resource_specification["disk"] = 1000 # some random nontrivial amount? - # memory = 2000 * self.cores_per_worker # similarly rather random - # resource_specification["memory"] = int(memory) - # resource_specification["running_time_min"] = self.max_simulation_time - # if self.gpu: - # resource_specification["gpus"] = nclients - # return resource_specification + spec = self.spec.copy() + multi = min(n_clients, self.max_resource_multiplier) + spec["cores"] *= multi + spec["gpus"] *= multi + spec["memory"] *= multi + return spec class ModelTraining(ExecutionDefinition): @@ -654,6 +701,8 @@ def from_config( cls, parsl_log_level: str, default_threads: int, + garbage_collect: bool, + retries: int, **kwargs, ) -> "ExecutionContext": path = Path.cwd().resolve() / PSIFLOW_INTERNAL @@ -693,7 +742,11 @@ def from_config( executors.extend(internal) config = Config( - executors=executors, run_dir=str(path), initialize_logging=False + executors=executors, + run_dir=str(path), + initialize_logging=False, + garbage_collect=garbage_collect, + retries=retries, ) return ExecutionContext(config, definitions, path / "context_dir", **kwargs) @@ -827,10 +880,11 @@ def __exit__(self, exc_type, exc_val, exc_tb): parsl_log_level: WARNING usage_tracking: 3 default_threads: 4 -max_idletime: 20 tmpdir_root: /tmp keep_tmpdirs: false -gpu_flavour: nvidia +container_engine: apptainer 
+garbage_collect: true +retries: 0 ModelEvaluation: executor: threadpool @@ -860,7 +914,7 @@ def create_bash_template(tmpdir_root: str, keep_tmpdirs: bool) -> str: # Create and move into new tmpdir for app execution tmpdir=$(mktemp -d -p {tmpdir_root} "psiflow-tmp.XXXXXXXXXX") cd $tmpdir; echo "tmpdir: $PWD" - export {{env}} + {{env}} printenv # Actual app definition goes here @@ -874,9 +928,33 @@ def create_bash_template(tmpdir_root: str, keep_tmpdirs: bool) -> str: def format_env_vars(env_vars: dict) -> str: - return " ".join([f"{k}={v}" for k, v in env_vars.items()]) + if len(env_vars) == 0: + return "" + return "export" + " ".join([f"{k}={v}" for k, v in env_vars.items()]) def str_to_timedelta(s: str) -> timedelta: t = datetime.strptime(s, "%H:%M:%S") - return timedelta(hours=t.hour, minutes=t.minute, seconds=t.second) \ No newline at end of file + return timedelta(hours=t.hour, minutes=t.minute, seconds=t.second) + + +def log_dfk_tasks(verbose: bool = False): + """Get an overview of all tasks stored in the parsl DFK. 
For debugging purposes.""" + dfk = parsl.dfk() + parsl.wait_for_current_tasks() + log = ["- Parsl task overview -"] + if not verbose: + log += [f"{i}\t{d['func_name']}" for i, d in dfk.tasks.items()] + log.append("- Parsl task overview -") + print(*log, sep="\n") + return + + for i, d in dfk.tasks.items(): + args = [(_.split("/")[-1] if isinstance(_, str) else _) for _ in d["args"]] + if "inputs" in (kwargs := d["kwargs"]): + kwargs["inputs"] = [f.filename for f in kwargs["inputs"]] + if "outputs" in kwargs: + kwargs["outputs"] = [f.filename for f in kwargs["outputs"]] + log.append(f"\n{i}\t{d['func_name']:<30}\n{args}\n{kwargs}") + log.append("- Parsl task overview -") + print(*log, sep="\n") diff --git a/psiflow/sampling/sampling.py b/psiflow/sampling/sampling.py index 3007c37..e37e1b5 100644 --- a/psiflow/sampling/sampling.py +++ b/psiflow/sampling/sampling.py @@ -1,6 +1,7 @@ import math import xml.etree.ElementTree as ET from dataclasses import dataclass +from itertools import cycle from typing import Optional, Union, Iterable import parsl @@ -8,6 +9,7 @@ from parsl.app.app import bash_app from parsl.data_provider.files import File from parsl.dataflow.futures import AppFuture, DataFuture +from sympy import print_glsl import psiflow from psiflow.data import Dataset @@ -362,6 +364,42 @@ def setup_smotion( return smotion +def define_clients_n_kwargs( + walkers: list[Walker], + components: list[HamiltonianComponent], + definition: "ModelEvaluation", + defaults: dict, +) -> tuple[list[dict], int]: + """Figure out i-Pi MD driver (force evaluator) configuration. 
+ How many clients with which arguments on which resources?""" + + # separate hamiltonian components by computational cost + cheap, expensive = {}, {} + for i, comp in enumerate(components): + # the "idx" key corresponds with a serialized function in app inputs + if isinstance(comp.hamiltonian, MACEHamiltonian): + expensive[i] = comp + else: + cheap[i] = comp + + # cheap drivers only get a single client + cheap_kwargs = [] + for i, comp in cheap.items(): + cheap_kwargs.append(defaults | {"idx": i, "address": comp.address}) + + # expensive drivers are assigned to clients by ModelEvaluation + # TODO: currently there is no distinction between global MLPs (for every system) and + # bias MLPs (for a few systems in the total simulation), possibly leading to load balancing problems + n_systems = int(sum([w.nbeads for w in walkers])) + expensive_kwargs = definition.get_driver_resources(n_systems, len(expensive)) + driver_iterator = cycle(expensive.items()) + for kwargs, (i, comp) in zip(expensive_kwargs, driver_iterator): + # TODO: should dtype be configurable? + kwargs |= defaults | {"idx": i, "address": comp.address, "dtype": "float32"} + + return cheap_kwargs + expensive_kwargs, len(expensive_kwargs) + + def make_server_command( command: str, input_xml: File, @@ -392,6 +430,7 @@ def make_driver_commands( driver_kwargs: list[dict], file_xyz: File, files_hamiltonian: list[File] ) -> list[str]: """""" + # TODO: what if 'file_xyz' contains multiple geometries of different size? 
assert len(driver_kwargs) >= len(files_hamiltonian) default = f'i-pi-driver-py -u -S "" -m custom -P {PATH_DRIVER} -a {{address}} -o {{options}} &' @@ -548,29 +587,14 @@ def _sample( # app setup and IO context = psiflow.context() - definition = context.definitions["ModelEvaluation"] input_file = context.new_file("input_", ".xml") _save_xml(simulation, outputs=[input_file]) inputs = [ input_file, Dataset([w.state for w in walkers]).extxyz, + *[c.hamiltonian.serialize_function() for c in hamiltonian_components], ] - # figure out i-Pi MD driver configuration - # how many drivers (force evaluators) with which arguments? - # remove any Harmonic instances because they are not implemented with sockets -- TODO: why? - max_nclients = int(sum([w.nbeads for w in walkers])) - driver_kwargs = [] - for i, comp in enumerate(hamiltonian_components): - inputs.append(comp.hamiltonian.serialize_function()) - kwargs = {"idx": i, "address": comp.address, "max_force": max_force} - if isinstance(comp.hamiltonian, MACEHamiltonian): - kwargs["dtype"] = "float32" # TODO: should this be configurable? - for instance_kwargs in definition.get_driver_devices(max_nclients): - driver_kwargs.append(kwargs | instance_kwargs) - else: - driver_kwargs.append(kwargs) - outputs = [context.new_file("data_", ".xyz")] outputs += [context.new_file("simulation_", ".txt") for _ in walkers] if keep_trajectory: @@ -590,6 +614,11 @@ def _sample( else: coupling_copy_command = None + definition = context.definitions["ModelEvaluation"] + driver_kwargs, n_clients = define_clients_n_kwargs( + walkers, hamiltonian_components, definition, {"max_force": max_force} + ) + # TODO: an app to check for valid input? 
(e.g., PBC + barostat) result = execute_ipi( len(walkers), @@ -602,7 +631,7 @@ def _sample( bash_template=context.bash_template, inputs=inputs, outputs=outputs, - parsl_resource_specification=definition.wq_resources(max_nclients), + parsl_resource_specification=definition.wq_resources(n_clients), ) # process MD output From ad3e31eaa51d5b802d7951cb5c14a8095afd88db Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Tue, 10 Mar 2026 18:45:21 +0100 Subject: [PATCH 08/15] Update execution.py Update ModelTraining - this will be adapted when we update MACE etc. Update ReferenceEvaluation - updated memory_limit - allow creating Reference instances that ask for fewer cores than specified in ReferenceEvaluation (eliminating the need for CP2K/CP2K_small/...) --- psiflow/execution.py | 292 ++++++++++++++++++++----------------------- 1 file changed, 133 insertions(+), 159 deletions(-) diff --git a/psiflow/execution.py b/psiflow/execution.py index 854f020..a29f188 100644 --- a/psiflow/execution.py +++ b/psiflow/execution.py @@ -1,17 +1,25 @@ import logging -import re import shutil import sys import warnings import subprocess import textwrap +import inspect from datetime import datetime, timedelta from dataclasses import dataclass from pathlib import Path from threading import Lock - -# see https://stackoverflow.com/questions/59904631/python-class-constants-in-dataclasses -from typing import Any, Optional, Union, ClassVar, Protocol, Iterable, Sequence +from typing import ( + Any, + Optional, + Union, + ClassVar, + Protocol, + Iterable, + Sequence, + Callable, + TypeVar, +) import parsl import psutil @@ -29,11 +37,13 @@ from parsl.providers import LocalProvider, SlurmProvider from parsl.providers.base import ExecutionProvider +from psiflow.utils.config import PSIFLOW_INTERNAL, PARSL_LOGFILE, PSIFLOW_LOGFILE +from psiflow.utils.logging import setup_logging + logger = logging.getLogger(__name__) # logging per module -PSIFLOW_INTERNAL = "psiflow_internal" # TODO: move configuration 
files somewhere @dataclass @@ -41,7 +51,7 @@ class ContainerSpec: """Controls container configuration""" uri: str - engine: str + engine: str = "apptainer" addopts: str = " --no-eval -e --no-mount home -W /tmp --writable-tmpfs" gpu_flavour: str | None = None @@ -52,27 +62,20 @@ def __post_init__(self): def launch_command(self) -> str: pwd = Path.cwd().resolve() # access to data / internal dir - args = [self.engine, "exec", self.addopts, f"--bind {pwd}"] + args = [self.engine, "run", self.addopts, f"--bind {pwd}"] if self.gpu_flavour == "cuda": args.append("--nv") elif self.gpu_flavour == "rocm": args.append("--rocm") + args.append(self.uri) return " ".join(args) - @staticmethod - def from_kwargs(kwargs: dict) -> Optional["ContainerSpec"]: - if "container_uri" not in kwargs: - return None - keys = ("uri", "engine", "addopts", "gpu_flavour") - kwargs = {k: kwargs[k2] for k in keys if (k2 := f"container_{k}") in kwargs} - return ContainerSpec(**kwargs) - class ReferenceSpec(Protocol): """Defines default options for Reference implementations""" name: ClassVar[str] - reference_args: ClassVar[tuple[str, ...]] # TODO: update 'cores_per_worker' + reference_args: ClassVar[tuple[str, ...]] mpi_command: str mpi_args: Sequence[str] executable: str @@ -80,19 +83,14 @@ class ReferenceSpec(Protocol): def launch_command(self) -> str: raise NotImplementedError - @classmethod - def from_kwargs(cls, **kwargs): - keys = ("mpi_command", "mpi_args", "executable") - return cls(**{k: kwargs[k] for k in keys if k in kwargs}) - @dataclass class CP2KReferenceSpec(ReferenceSpec): name = "CP2K" - reference_args = ("cores_per_worker",) - mpi_command: str = "mpirun -np {cores_per_worker}" + reference_args = ("cores_per_task",) + mpi_command: str = "mpiexec -n {cores_per_task}" mpi_args: Sequence[str] = ( - "-ENV OMP_NUM_THREADS=1", + "-genv OMP_NUM_THREADS=1", "--bind-to core", "--map-by core", ) @@ -105,8 +103,8 @@ def launch_command(self): @dataclass class GPAWReferenceSpec(ReferenceSpec): 
name = "GPAW" - reference_args = ("cores_per_worker",) - mpi_command: str = "mpirun -np {cores_per_worker}" + reference_args = ("cores_per_task",) + mpi_command: str = "mpirun -np {cores_per_task}" mpi_args: Sequence[str] = ( "-x OMP_NUM_THREADS=1", "--bind-to core", @@ -260,6 +258,12 @@ def __init__( msg = "Allowed task runtime exceeds provider walltime. Tasks might get killed by the scheduler." warnings.warn(msg) self.max_runtime = max_runtime + if ( + self.executor_type == "workqueue" + and self.kwargs["min_runtime"] >= self.max_runtime + ): + msg = "Minimum task runtime exceeds maximum runtime. WQ might not not start tasks." + warnings.warn(msg) # set default WQ resource specs self.spec: dict | None = None @@ -273,24 +277,24 @@ def __init__( } register_definition(definition=self) - # handle task environment variables # TODO: how to handle env variables? + # disable thread affinity and busy-idling until we can isolate task resources + default_env_vars = { + "OMP_PROC_BIND": "FALSE", + "OMP_WAIT_POLICY": "PASSIVE", + "OMP_DISPLAY_ENV": "VERBOSE", # verbose OMP log + } if self.executor_type == "threadpool": - # disable thread affinity and busy-idling - default_env_vars = { - "OMP_PROC_BIND": "FALSE", - "OMP_WAIT_POLICY": "PASSIVE", - "OMP_NUM_THREADS": f"{self.cores_per_task}", - # "OMP_DISPLAY_ENV": "VERBOSE", # verbose OMP log - } + default_env_vars |= {"OMP_NUM_THREADS": f"{self.cores_per_task}"} else: - # assert False, "IMPLEMENT THIS" - default_env_vars = {} - self.env_vars = default_env_vars | (env_vars or {}) + # WQ sets OMP_NUM_THREADS itself + pass - # TODO: check between min_runtime and max_runtime? 
+ # yaml parsing might un-stringify some keys + env_vars = {k: str(v).upper() for k, v in (env_vars or {}).items()} + self.env_vars = default_env_vars | (env_vars or {}) - pass + return @property def name(self) -> str: @@ -319,13 +323,13 @@ def task_slots(self) -> int: if self.executor_type == "threadpool": return self.kwargs["max_threads"] - slots = self.resources["cores"] // self.cores_per_task gpu_slots, memory_slots = float("inf"), float("inf") + cpu_slots = self.resources["cores"] // self.cores_per_task if self.use_gpu: gpu_slots = self.resources["gpus"] // self.kwargs["gpus_per_task"] if (mem_per_task := self.kwargs["mem_per_task"]) > 0: memory_slots = self.resources["memory"] // mem_per_task - return min(slots, gpu_slots, memory_slots) + return min(cpu_slots, gpu_slots, memory_slots) def wrap_in_timeout(self, command: str) -> str: if self.max_runtime == float("inf"): @@ -334,6 +338,13 @@ def wrap_in_timeout(self, command: str) -> str: # send SIGTERM after max_runtime, follow with SIGKILL 30s later return f"timeout -k 30s {self.max_runtime}s {command}" + def wrap_in_srun(self, command: str) -> str: + # TODO: stub -- this does not work + if self.provider is None: + return command # noop + + return f"srun -t 1 -c $CORES {command}" + def _create_threadpool(self, path: Path) -> ThreadPoolExecutor: max_threads = self.kwargs["max_threads"] return ThreadPoolExecutor(self.name, max_threads, working_dir=str(path)) @@ -345,11 +356,7 @@ def _create_workqueue(self, path: Path) -> WorkQueueExecutor: timeout = int(1e6) if self.resources["nodes"] > 1 else 20 cores = self.resources["cores"] - worker_options = [ - "--parent-death", - f"--cores={cores}", - f"--timeout={timeout}", - ] + worker_options = ["--parent-death", f"--cores={cores}", f"--timeout={timeout}"] if (memory := self.resources["memory"]) is not None: worker_options.append(f"--memory={memory * 1000}") # in MB if (lifetime := self.lifetime) != float("inf"): @@ -367,16 +374,12 @@ def _create_workqueue(self, path: 
Path) -> WorkQueueExecutor: prepend = self.container.launch_command() worker_executable = f"{prepend} {worker_executable}" - # TODO: why the custom WQ? -- does not seem necessary (anymore) executor = WorkQueueExecutor( label=self.name, working_dir=str(path / self.name), provider=self.provider, shared_fs=True, - # autocategory=False, port=0, # avoid multiple executors trying to use the same port - # max_retries=1, - # coprocess=False, worker_options=" ".join(worker_options), worker_executable=worker_executable, scaling_cores_per_worker=cores, @@ -410,11 +413,13 @@ def from_config( } elif executor == "workqueue": executor_kwargs = { - "cores_per_task": kwargs.get("cores_per_task", 0), + "cores_per_task": kwargs.get("cores_per_task", 1), "gpus_per_task": kwargs.get("gpus_per_task", 0), "mem_per_task": kwargs.get("mem_per_task", 0), } - assert any(v != 0 for v in executor_kwargs.values()) + assert ( + executor_kwargs["cores_per_task"] > 0 + ), "WQ needs at least one core to launch tasks" min_runtime = kwargs.get("min_runtime", "00:00:00") executor_kwargs["min_runtime"] = str_to_timedelta(min_runtime).seconds else: @@ -451,7 +456,7 @@ def __init__( ): super().__init__(**kwargs) - if self.use_gpu and self.kwargs['gpus_per_task'] > 1: + if self.use_gpu and self.kwargs["gpus_per_task"] > 1: # TODO: 'ConfigurationError' maybe? raise ValueError("No Hamiltonian can do multi-GPU evaluation") @@ -521,40 +526,23 @@ def wq_resources(self, n_clients: int) -> dict: class ModelTraining(ExecutionDefinition): - def __init__( - self, - multigpu: bool = False, # TODO: how to handle this? - **kwargs, - ) -> None: + def __init__(self, **kwargs) -> None: super().__init__(**kwargs) - self.multigpu = multigpu - if self.multigpu: - # TODO: why? Think this might be a multinode thing - which I do not care about - message = ( - "the max_training_time keyword does not work " - "in combination with multi-gpu training. 
Adjust " - "the maximum number of epochs to control the " - "duration of training" - ) - assert self.max_runtime is None, message if not self.use_gpu: warnings.warn( "ModelTraining is configured for CPU operation. Is this what you want?" ) - # default_env_vars = { - # "OMP_NUM_THREADS": str(self.cores_per_worker), - # "KMP_AFFINITY": "granularity=fine,compact,1,0", - # "KMP_BLOCKTIME": "1", - # "OMP_PROC_BIND": "spread", # different from Model Eval - # "PYTHONUNBUFFERED": "TRUE", - # } - # if env_vars is None: - # env_vars = default_env_vars - # else: - # default_env_vars.update(env_vars) - # env_vars = default_env_vars + # if self.multigpu: + # # TODO: why? Think this might be a multinode thing - which I do not care about + # message = ( + # "the max_training_time keyword does not work " + # "in combination with multi-gpu training. Adjust " + # "the maximum number of epochs to control the " + # "duration of training" + # ) + # assert self.max_runtime is None, message def train_command(self, initialize: bool = False): command = "psiflow-mace-train" @@ -563,78 +551,52 @@ def train_command(self, initialize: bool = False): def wq_resources(self, *args, **kwargs) -> dict: if self.spec is None: return {} # threadpool - # TODO: reimplement this - return self.spec - - # def wq_resources(self): - # if self.use_threadpool: - # return {} - # resource_specification = {} - # - # if self.multigpu: - # nworkers = int(self.cores_available / self.cores_per_worker) - # else: - # nworkers = 1 - # - # resource_specification["gpus"] = nworkers # one per GPU - # resource_specification["cores"] = self.cores_available - # resource_specification["disk"] = ( - # 1000 * nworkers - # ) # some random nontrivial amount? 
- # memory = 1000 * self.cores_available # similarly rather random - # resource_specification["memory"] = int(memory) - # resource_specification["running_time_min"] = self.max_training_time - # return resource_specification + return self.spec.copy() class ReferenceEvaluation(ExecutionDefinition): def __init__( self, - reference: "ReferenceSpec", - memory_limit: Optional[str] = None, # TODO: how does this work? + reference: ReferenceSpec, + memory_limit: Optional[float] = None, **kwargs, ) -> None: - # TODO: how to know which code? - # before super().__init__ because 'name' attribute needed - self.reference = reference super().__init__(**kwargs) - self.memory_limit = memory_limit + self.reference = reference + self.memory_limit = memory_limit # in GB + + if self.use_gpu: + warnings.warn("Reference calculations do not support GPU computation yet.") def command(self): - # TODO: this does not work probably - launch_command = self.spec.launch_command() - kwargs = {k: getattr(self, k) for k in self.spec.reference_args} - launch_command = launch_command.format(**kwargs) + command = self.reference.launch_command() + kwargs = {k: getattr(self, k) for k in self.reference.reference_args} + command = command.format(**kwargs) if self.container is not None: - launch_command = f"{self.container.launch_command()} {launch_command}" - - launch_command = self.wrap_in_timeout(launch_command) - - commands = [] - if self.memory_limit is not None: - # based on https://stackoverflow.com/a/42865957/2002471 - units = {"KB": 1, "MB": 2**10, "GB": 2**20, "TB": 2**30} - - def parse_size(size): # TODO: to utils? 
- size = size.upper() - if not re.match(r" ", size): - size = re.sub(r"([KMGT]?B)", r" \1", size) - number, unit = [string.strip() for string in size.split()] - return int(float(number) * units[unit]) + command = f"{self.container.launch_command()} {command}" + if (mem := self.memory_limit) is not None: + # set max RAM usage and disable swap storage - requires systemd-run + command = f"systemd-run --user --scope -p MemoryMax={mem}G -p MemorySwapMax=0 {command}" - commands.append(f"ulimit -v {parse_size(self.memory_limit)}") - - # exit code 0 so parsl always thinks bash app succeeded - return "\n".join([*commands, launch_command, "exit 0"]) + return self.wrap_in_timeout(command) - def wq_resources(self, *args, **kwargs) -> dict: + def wq_resources(self, n_cores: int | None) -> dict: if self.spec is None: return {} # threadpool - return self.spec + + fraction = 1 + if n_cores is not None: + fraction = n_cores / self.kwargs["cores_per_task"] + spec = self.spec.copy() + spec["cores"] = int(spec["cores"] * fraction) + spec["memory"] *= fraction + return spec @property def name(self) -> str: + if not hasattr(self, "reference"): + return super().name # during init return self.reference.name @@ -670,7 +632,6 @@ def __init__( # make sure task tmpdirs can be made Path(tmpdir_root).mkdir(parents=True, exist_ok=True) self.bash_template = create_bash_template(tmpdir_root, keep_tmpdirs) - self.file_index = {} self.lock = Lock() parsl.load(config) @@ -701,8 +662,6 @@ def from_config( cls, parsl_log_level: str, default_threads: int, - garbage_collect: bool, - retries: int, **kwargs, ) -> "ExecutionContext": path = Path.cwd().resolve() / PSIFLOW_INTERNAL @@ -711,27 +670,36 @@ def from_config( path.mkdir(parents=True) patch_parsl_dirtree() - log_file = str(path / "parsl.log") + # setup logging + log_file = str(path / PARSL_LOGFILE) log_level = getattr(logging, parsl_log_level) parsl.set_file_logger(filename=log_file, name="parsl", level=log_level) + setup_logging(file=path / 
PSIFLOW_LOGFILE) # TODO + + # default container for ModelEvaluation and ModelTraining + base_container = None + if "container" in kwargs: + base_container = make_cls(ContainerSpec, **kwargs["container"]) # create definitions - base_container = ContainerSpec.from_kwargs(kwargs) model_evaluation = ModelEvaluation.from_config( container=base_container, **kwargs["ModelEvaluation"] ) model_training = ModelTraining.from_config( container=base_container, **kwargs["ModelTraining"] ) - - reference_evaluations = [] # reference evaluations might be class specific - for key in list(kwargs.keys()): - if key[:4] in REFERENCE_SPECS: # allow for e.g., CP2K_small + reference_evaluations = [] # reference evaluations are class specific + for key, reference_cls in REFERENCE_SPECS.items(): + if key in kwargs: config = kwargs[key] + container = None + if "container" in config: + container = make_cls(ContainerSpec, **config.pop("container")) + reference = make_cls(reference_cls, **config) reference_evaluation = ReferenceEvaluation.from_config( - reference=REFERENCE_SPECS[key[:4]].from_kwargs(**config), - container=ContainerSpec.from_kwargs(kwargs | config), - **config, + reference=reference, + container=container, + **config, # make sure the container key is removed ) reference_evaluations.append(reference_evaluation) definitions = [model_evaluation, model_training, *reference_evaluations] @@ -741,12 +709,12 @@ def from_config( internal = make_default_executors(default_threads, path, base_container) executors.extend(internal) - config = Config( + config = make_cls( + Config, executors=executors, run_dir=str(path), initialize_logging=False, - garbage_collect=garbage_collect, - retries=retries, + **kwargs, ) return ExecutionContext(config, definitions, path / "context_dir", **kwargs) @@ -828,16 +796,10 @@ def __call__(self, command: str, tasks_per_node: int, nodes_per_block: int) -> s return x -# class MyWorkQueueExecutor(WorkQueueExecutor): -# # TODO: what does this do? 
-# def _get_launch_command(self, block_id): -# return self.worker_command - - # TODO: move everything below to appropriate files # TODO: attempt at managing priority through global state -WQ_RESOURCES_REGISTRY = {} +WQ_RESOURCES_REGISTRY = [] def register_definition(definition: ExecutionDefinition) -> None: @@ -845,7 +807,7 @@ def register_definition(definition: ExecutionDefinition) -> None: if (spec := definition.spec) is None: return # threadpool does not have priority - WQ_RESOURCES_REGISTRY[definition.name] = spec + WQ_RESOURCES_REGISTRY.append((definition.name, spec)) spec["priority"] = SetWQPriority.default @@ -863,14 +825,14 @@ def __init__(self, value: int, verbose: bool = False) -> None: def __enter__(self): if self.verbose: print(f"SetWQPriority setting priority:\t{self.value}") - for n, spec in WQ_RESOURCES_REGISTRY.items(): + for n, spec in WQ_RESOURCES_REGISTRY: spec["priority"] = self.value return self def __exit__(self, exc_type, exc_val, exc_tb): if self.verbose: print(f"SetWQPriority unsetting {self.value}") - for n, spec in WQ_RESOURCES_REGISTRY.items(): + for n, spec in WQ_RESOURCES_REGISTRY: spec["priority"] = SetWQPriority.default @@ -882,7 +844,6 @@ def __exit__(self, exc_type, exc_val, exc_tb): default_threads: 4 tmpdir_root: /tmp keep_tmpdirs: false -container_engine: apptainer garbage_collect: true retries: 0 @@ -958,3 +919,16 @@ def log_dfk_tasks(verbose: bool = False): log.append(f"\n{i}\t{d['func_name']:<30}\n{args}\n{kwargs}") log.append("- Parsl task overview -") print(*log, sep="\n") + + +# TODO: after 3.12, this is no longer needed +# https://docs.python.org/3/library/typing.html +T = TypeVar("T") + + +def make_cls(cls: type[T], **kwargs: Any) -> T: + """Very simple class factory. 
Use introspection to filter args and kwargs.""" + sign = inspect.signature(cls) + argument_names = list(sign.parameters.keys()) + arguments = {k: kwargs[k] for k in argument_names if k in kwargs} + return cls(**arguments) From bb09d393945a1eb558ca1d4df2cd2f6222a62cc1 Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Tue, 10 Mar 2026 18:46:53 +0100 Subject: [PATCH 09/15] updates and bugfixes - update modules to work with the new execution module and syntax - fix tests --- configs/local_test.yaml | 40 +++++++++++++++++----------------- psiflow/free_energy/phonons.py | 2 +- psiflow/reference/cp2k_.py | 30 +++++++++++-------------- psiflow/reference/dummy.py | 6 ++--- psiflow/reference/gpaw_.py | 14 ++++-------- psiflow/reference/orca_.py | 15 +++++-------- psiflow/reference/reference.py | 34 ++++++++++++++++++----------- psiflow/sampling/optimize.py | 2 +- psiflow/sampling/server.py | 2 +- psiflow/sampling/walker.py | 3 ++- psiflow/serialization.py | 7 +++--- 11 files changed, 75 insertions(+), 80 deletions(-) diff --git a/configs/local_test.yaml b/configs/local_test.yaml index 3f9dc2b..b44c2af 100644 --- a/configs/local_test.yaml +++ b/configs/local_test.yaml @@ -1,32 +1,32 @@ --- -parsl_log_level: WARNING -retries: 0 -make_symlinks: false - ModelEvaluation: - gpu: false - use_threadpool: false - max_simulation_time: 1 - + executor: threadpool + max_threads: 4 + max_runtime: 00:00:20 + ModelTraining: - gpu: true - use_threadpool: true - max_training_time: 1 - max_workers: 1 # suppress assertion for multigpu training + executor: threadpool + max_threads: 4 + max_runtime: 00:00:20 + CP2K: - cores_per_worker: 1 - max_evaluation_time: 0.1 - container_uri: 'oras://ghcr.io/molmod/cp2k:2024.1' + executor: workqueue + cores_per_task: 2 + max_runtime: 00:00:20 + memory_limit: 2 + container: + uri: docker://cp2k/cp2k:2025.2_mpich_x86_64_psmp GPAW: - cores_per_worker: 1 - max_evaluation_time: 0.1 - container_uri: 'oras://ghcr.io/molmod/gpaw:24.1' + executor: workqueue + 
cores_per_task: 2 + container: + uri: oras://ghcr.io/molmod/gpaw:24.1 ORCA: - cores_per_worker: 1 - max_evaluation_time: 0.1 + executor: workqueue + cores_per_task: 2 ... diff --git a/psiflow/free_energy/phonons.py b/psiflow/free_energy/phonons.py index 2c2513c..2f449ef 100644 --- a/psiflow/free_energy/phonons.py +++ b/psiflow/free_energy/phonons.py @@ -162,7 +162,7 @@ def compute_harmonic( inputs.append(comp.hamiltonian.serialize_function(dtype="float64")) kwargs = {"idx": i, "address": comp.address} if isinstance(comp.hamiltonian, MACEHamiltonian): - kwargs |= definition.get_driver_devices(1)[0] + kwargs |= definition.get_driver_resources(1, 1)[0] driver_kwargs.append(kwargs) result = execute_ipi( diff --git a/psiflow/reference/cp2k_.py b/psiflow/reference/cp2k_.py index eb30e73..4d707f8 100644 --- a/psiflow/reference/cp2k_.py +++ b/psiflow/reference/cp2k_.py @@ -1,7 +1,7 @@ import copy import io import warnings -from typing import Optional, Union +from typing import Optional, Union, ClassVar import numpy as np from ase.data import chemical_symbols @@ -56,15 +56,16 @@ def modify_input(input_dict: dict, properties: tuple) -> None: def parse_output(output_str: str, properties: tuple) -> dict[str, float | np.ndarray]: + """Very basic output parser. 
Perhaps check the cp2k-output-tools package?""" lines = output_str.split("\n") data = {} # output status - idx = find_line(lines, "CP2K", reverse=True, max_lines=250) + key = "SUBROUTINE" + idx = find_line(lines, key, reverse=True, max_lines=100) data["status"] = status = Status.SUCCESS if idx is not None else Status.FAILED if status == Status.SUCCESS: - # total runtime - data["runtime"] = float(lines[idx].split()[-1]) + data["runtime"] = float(lines[idx + 2].split()[-1]) # total runtime # find number of atoms idx = find_line(lines, "TOTAL NUMBERS AND MAXIMUM NUMBERS") @@ -77,7 +78,7 @@ def parse_output(output_str: str, properties: tuple) -> dict[str, float | np.nda data["positions"] = lines_to_array(lines[idx : idx + natoms], 4, 7) # read energy - key = "ENERGY| Total FORCE_EVAL ( QS ) energy [a.u.]" + key = "ENERGY| Total FORCE_EVAL ( QS ) energy" idx = find_line(lines, key, idx) data["energy"] = float(lines[idx].split()[-1]) * Ha @@ -85,28 +86,23 @@ def parse_output(output_str: str, properties: tuple) -> dict[str, float | np.nda return data # read forces - key = "ATOMIC FORCES in [a.u.]" - idx = find_line(lines, key, idx) + 3 - forces = lines_to_array(lines[idx : idx + natoms], 3) + key = "FORCES| Atomic forces" + idx = find_line(lines, key, idx) + 2 + forces = lines_to_array(lines[idx : idx + natoms], 2, 5) return data | {"forces": forces * Ha / Bohr} @psiflow.serializable class CP2K(Reference): + executor: ClassVar[str] = "CP2K" _execute_label = "cp2k_singlepoint" input_dict: dict - def __init__( - self, - input_str: str, - executor: str = "CP2K", - outputs: Union[tuple, list] = ("energy", "forces"), - ): - self.executor = executor - self.outputs = tuple(outputs) + def __init__(self, input_str: str, **kwargs): + super().__init__(**kwargs) self.input_dict = str_to_dict(input_str) - modify_input(self.input_dict, outputs) + modify_input(self.input_dict, self.outputs) self._create_apps() def compute_atomic_energy(self, element, box_size=None) -> AppFuture[float]: 
diff --git a/psiflow/reference/dummy.py b/psiflow/reference/dummy.py index f947efb..eeb2baa 100644 --- a/psiflow/reference/dummy.py +++ b/psiflow/reference/dummy.py @@ -12,16 +12,16 @@ @psiflow.serializable class ReferenceDummy(Reference): + executor = "HTEX" _execute_label = "dummy_singlepoint" - def __init__(self, outputs: Union[tuple, list] = ("energy", "forces")): - self.outputs = outputs + def __init__(self, **kwargs): + super().__init__(**kwargs) self._create_apps() def _create_apps(self): # psiflow.context().definitions does not contain "default_htex" self.execute_command = "" - self.app_pre = self.create_input self.app_execute = partial( bash_app(_execute, executors=["default_htex"]), reference=self, diff --git a/psiflow/reference/gpaw_.py b/psiflow/reference/gpaw_.py index a119271..e787abb 100644 --- a/psiflow/reference/gpaw_.py +++ b/psiflow/reference/gpaw_.py @@ -1,6 +1,6 @@ import json from pathlib import Path -from typing import Optional, Union +from typing import Optional, Union, ClassVar from parsl import File from parsl.dataflow.futures import AppFuture @@ -34,22 +34,16 @@ def parse_output(stdout: str, properties: tuple[str, ...]) -> dict: @psiflow.serializable class GPAW(Reference): + executor: ClassVar[str] = "GPAW" _execute_label = "gpaw_singlepoint" parameters: dict script: str - def __init__( - self, - parameters: dict, - script: str | Path = FILEPATH, - outputs: Union[tuple, list] = ("energy", "forces"), - executor: str = "GPAW", - ): - self.outputs = tuple(outputs) + def __init__(self, parameters: dict, script: str | Path = FILEPATH, **kwargs): + super().__init__(**kwargs) self.parameters = parameters assert (script := Path(script)).is_file() self.script = str(script.resolve()) # absolute path - self.executor = executor self._create_apps() def compute_atomic_energy(self, element, box_size=None) -> AppFuture: diff --git a/psiflow/reference/orca_.py b/psiflow/reference/orca_.py index ca14115..509aa25 100644 --- a/psiflow/reference/orca_.py +++ 
b/psiflow/reference/orca_.py @@ -1,7 +1,7 @@ import warnings import re from functools import partial -from typing import Optional, Union +from typing import Optional, Union, ClassVar import ase.symbols import numpy as np @@ -121,20 +121,15 @@ def parse_output(stdout: str, properties: tuple[str, ...]) -> dict: @psiflow.serializable class ORCA(Reference): + executor: ClassVar[str] = "ORCA" _execute_label = "orca_singlepoint" input_template: str input_kwargs: dict - def __init__( - self, - input_template: str, - executor: str = "ORCA", - outputs: Union[tuple, list] = ("energy", "forces"), - ): - self.executor = executor - self.input_template = check_input(input_template, outputs) + def __init__(self, input_template: str, **kwargs): + super().__init__(**kwargs) + self.input_template = check_input(input_template, self.outputs) self.input_kwargs = DEFAULT_KWARGS.copy() # TODO: user control? - self.outputs = tuple(outputs) self._create_apps() def _create_apps(self): diff --git a/psiflow/reference/reference.py b/psiflow/reference/reference.py index 6a8c008..dc29ec2 100644 --- a/psiflow/reference/reference.py +++ b/psiflow/reference/reference.py @@ -93,6 +93,7 @@ def compute_dataset( def _execute( reference: Reference, inputs: list[File], + bash_template: str, parsl_resource_specification: Optional[dict] = None, stdout: str = parsl.AUTO_LOGNAME, stderr: str = parsl.AUTO_LOGNAME, @@ -100,8 +101,7 @@ def _execute( ) -> str: # TODO: we do not set env_vars here? command = reference.get_shell_command(inputs) - template = psiflow.context().bash_template - return template.format(commands=command, env='>/dev/null') + return bash_template.format(commands=command, env=">/dev/null") def _process_output( @@ -114,7 +114,7 @@ def _process_output( try: data = reference.parse_output(stdout) except LineNotFoundError: - # TODO: find out what went wrong + # TODO: find out what went wrong? 
data = {"status": Status.FAILED} data |= {"stdout": Path(inputs[0]), "stderr": Path(inputs[1])} return update_geometry(geom, data) @@ -123,10 +123,10 @@ def _process_output( @join_app def evaluate(reference: Reference, geom: Geometry) -> AppFuture[Geometry]: """""" - if geom == NullState: + if geom == NullState: # TODO: remove this warnings.warn("Skipping NullState..") return copy_app_future(geom) - execute, *files = reference.app_pre(geom=geom) + execute, *files = reference.create_input(geom=geom) if not execute: # TODO: should we reset geom? return copy_app_future(geom) future = reference.app_execute(inputs=files) @@ -138,14 +138,20 @@ def evaluate(reference: Reference, geom: Geometry) -> AppFuture[Geometry]: @psiflow.serializable class Reference(Computable): - outputs: Union[list[str], tuple[str, ...]] + outputs: Sequence[str] batch_size: ClassVar[int] = 1 # TODO: not really used - executor: str - app_pre: ClassVar[Callable] # TODO: fix serialisation - app_execute: ClassVar[Callable] + app_execute: ClassVar[Callable] # TODO: fix serialisation app_post: ClassVar[Callable] _execute_label: ClassVar[str] execute_command: str + executor: ClassVar[str] + n_cores: Optional[int] + + def __init__( + self, outputs: Sequence[str] = ("energy", "forces"), n_cores: int | None = None + ): + self.outputs: tuple[str, ...] 
= tuple(outputs) + self.n_cores = n_cores def compute( self, @@ -182,14 +188,16 @@ def compute_dataset(self, dataset: Dataset) -> Dataset: return Dataset(future) def _create_apps(self): - definition = psiflow.context().definitions[self.executor] + context = psiflow.context() + definition = context.definitions[self.executor] + if (n := self.n_cores) is not None: + assert n <= definition.spec["cores"] self.execute_command = definition.command() - wq_resources = definition.wq_resources() - self.app_pre = self.create_input self.app_execute = partial( bash_app(_execute, executors=[self.executor]), reference=self, - parsl_resource_specification=wq_resources, + bash_template=context.bash_template, + parsl_resource_specification=definition.wq_resources(n), label=self._execute_label, ) self.app_post = partial( diff --git a/psiflow/sampling/optimize.py b/psiflow/sampling/optimize.py index c8cd68c..cb531a4 100644 --- a/psiflow/sampling/optimize.py +++ b/psiflow/sampling/optimize.py @@ -179,7 +179,7 @@ def optimize( inputs.append(comp.hamiltonian.serialize_function(dtype="float64")) kwargs = {"idx": i, "address": comp.address} if isinstance(comp.hamiltonian, MACEHamiltonian): - kwargs |= definition.get_driver_devices(1)[0] + kwargs |= definition.get_driver_resources.get(1, 1)[0] driver_kwargs.append(kwargs) result = execute_ipi( diff --git a/psiflow/sampling/server.py b/psiflow/sampling/server.py index 6ade9fe..d0bc4e5 100644 --- a/psiflow/sampling/server.py +++ b/psiflow/sampling/server.py @@ -187,7 +187,7 @@ def main(): run(args.start_xyz, args.input_xml) softexit.trigger(status="success", message="@PSIFLOW: We are done here.") except ConnectionError: - # TODO: in this case, no output files are generated.. + # TODO: in this case, no output files are generated, so the task fails.. 
traceback.print_exc() softexit.trigger(status="bad", message="@PSIFLOW: Clients failed to connect.") except np.linalg.LinAlgError: diff --git a/psiflow/sampling/walker.py b/psiflow/sampling/walker.py index 9b2ce9c..42c711d 100644 --- a/psiflow/sampling/walker.py +++ b/psiflow/sampling/walker.py @@ -14,6 +14,7 @@ from psiflow.data import Dataset from psiflow.geometry import Geometry, check_equality from psiflow.hamiltonians import Hamiltonian, Zero, combine_hamiltonians + # from psiflow.order_parameters import OrderParameter from psiflow.sampling.metadynamics import Metadynamics from psiflow.utils.apps import copy_app_future @@ -68,7 +69,7 @@ def get_ensemble_kwargs(walker: "Walker") -> dict: @dataclass class Walker: start: Union[Geometry, AppFuture] - hamiltonian: Hamiltonian = Zero() + hamiltonian: Hamiltonian = field(default_factory=lambda: Zero()) timestep: float = 0.5 temperature: Optional[float] = 300 pressure: Optional[float] = None diff --git a/psiflow/serialization.py b/psiflow/serialization.py index 6e49038..ec455b0 100644 --- a/psiflow/serialization.py +++ b/psiflow/serialization.py @@ -4,6 +4,7 @@ import json from pathlib import Path from typing import ClassVar, Optional, Union, get_args, get_origin, get_type_hints +from collections.abc import Sequence from dataclasses import InitVar import typeguard @@ -81,18 +82,18 @@ def serializable(cls): origin = get_origin(type_hint) if origin is ClassVar: continue # do nothing for classvars - elif origin == dict: + elif origin in (dict, Sequence): kind = "attrs" elif isinstance(type_hint, str) and type_hint.startswith("dataclasses"): continue elif isinstance(type_hint, InitVar): continue - if kind is None and not inspect.isclass(type_hint): + elif kind is not None and not inspect.isclass(type_hint): raise ValueError( "{} is formally not a class ({})".format(type_hint, name) ) - if issubclass(type_hint, Serializable): + elif issubclass(type_hint, Serializable): kind = "serial" elif type_hint is Geometry: kind = 
"geoms" From 54089b0df6a4862090f570967f28b02ae03feb8e Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Tue, 10 Mar 2026 20:04:27 +0100 Subject: [PATCH 10/15] setup psiflow.log logging --- psiflow/utils/logging.py | 41 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 psiflow/utils/logging.py diff --git a/psiflow/utils/logging.py b/psiflow/utils/logging.py new file mode 100644 index 0000000..0976590 --- /dev/null +++ b/psiflow/utils/logging.py @@ -0,0 +1,41 @@ +import logging +from pathlib import Path + +import parsl + + +def setup_logging(file: Path, level=logging.INFO) -> None: + """Setup the Psiflow parent logger""" + logger = logging.getLogger('psiflow') + logger.setLevel(level) + logger.propagate = False # do not propagate messages to root logger + + fh = logging.FileHandler(file) + formatter = logging.Formatter( + fmt='%(asctime)s %(name)s [%(levelname)s] %(message)s', + datefmt='%Y-%m-%d %H:%M' + ) + fh.setFormatter(formatter) + logger.addHandler(fh) + + +def log_dfk_tasks(verbose: bool = False): + """Get an overview of all tasks stored in the parsl DFK. 
For debugging purposes.""" + dfk = parsl.dfk() + parsl.wait_for_current_tasks() + log = ["- Parsl task overview -"] + if not verbose: + log += [f"{i}\t{d['func_name']}" for i, d in dfk.tasks.items()] + log.append("- Parsl task overview -") + print(*log, sep="\n") + return + + for i, d in dfk.tasks.items(): + args = [(_.split("/")[-1] if isinstance(_, str) else _) for _ in d["args"]] + if "inputs" in (kwargs := d["kwargs"]): + kwargs["inputs"] = [f.filename for f in kwargs["inputs"]] + if "outputs" in kwargs: + kwargs["outputs"] = [f.filename for f in kwargs["outputs"]] + log.append(f"\n{i}\t{d['func_name']:<30}\n{args}\n{kwargs}") + log.append("- Parsl task overview -") + print(*log, sep="\n") \ No newline at end of file From 029f26f27876b37291132dd64a3d7daf19799861 Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Tue, 10 Mar 2026 20:06:01 +0100 Subject: [PATCH 11/15] cleanup action --- psiflow/execution.py | 193 ++++++++----------------------------- psiflow/models/model.py | 6 +- psiflow/sampling/ase.py | 4 +- psiflow/sampling/output.py | 8 +- psiflow/utils/_plumed.py | 5 - psiflow/utils/apps.py | 95 +++++++----------- psiflow/utils/config.py | 23 +++++ psiflow/utils/io.py | 11 +-- psiflow/utils/parse.py | 24 +++-- psiflow/utils/wq.py | 40 ++++++++ tests/test_reference.py | 14 ++- tests/test_sampling.py | 2 +- 12 files changed, 170 insertions(+), 255 deletions(-) create mode 100644 psiflow/utils/config.py create mode 100644 psiflow/utils/wq.py diff --git a/psiflow/execution.py b/psiflow/execution.py index a29f188..34b643d 100644 --- a/psiflow/execution.py +++ b/psiflow/execution.py @@ -1,25 +1,12 @@ import logging import shutil import sys -import warnings import subprocess -import textwrap import inspect -from datetime import datetime, timedelta from dataclasses import dataclass from pathlib import Path from threading import Lock -from typing import ( - Any, - Optional, - Union, - ClassVar, - Protocol, - Iterable, - Sequence, - Callable, - TypeVar, -) +from typing 
import Any, Optional, Union, ClassVar, Protocol, Sequence, TypeVar import parsl import psutil @@ -37,13 +24,24 @@ from parsl.providers import LocalProvider, SlurmProvider from parsl.providers.base import ExecutionProvider -from psiflow.utils.config import PSIFLOW_INTERNAL, PARSL_LOGFILE, PSIFLOW_LOGFILE +from psiflow.utils.config import ( + PSIFLOW_INTERNAL, + PARSL_LOGFILE, + PSIFLOW_LOGFILE, + DEFAULT_CONFIG, + CONTEXT_DIR, +) from psiflow.utils.logging import setup_logging +from psiflow.utils.wq import register_definition +from psiflow.utils.apps import create_bash_template +from psiflow.utils.parse import str_to_timedelta logger = logging.getLogger(__name__) # logging per module +class ConfigurationError(ValueError): + pass # some global psiflow configuration option does not make sense @dataclass @@ -149,7 +147,7 @@ def make_slurm_provider(kwargs: dict) -> tuple[SlurmProvider, dict]: resources = { "nodes": provider.nodes_per_block, "cores": provider.cores_per_node, - "memory": provider.mem_per_node, + "memory": provider.mem_per_node or float("inf"), "gpus": provider.gpus_per_node, "lifetime": str_to_timedelta(provider.walltime).seconds, } @@ -232,21 +230,20 @@ def __init__( elif container is not None and container.gpu_flavour is None: msg = "Provide container 'gpu_flavour' to choose between CUDA and ROCM" if msg: - raise ValueError(msg) + raise ConfigurationError(msg) if self.executor_type == "workqueue": - # WQ-specific checks TODO: check that WQ kwargs do not exceed resources? + # WQ-specific checks msg = "" if self.kwargs["gpus_per_task"] > resources["gpus"]: msg = "GPUs" if self.kwargs["cores_per_task"] > resources["cores"]: msg = "cores" - if self.kwargs["mem_per_task"] > (resources["memory"] or float("inf")): - # TODO: do we need memory=None anywhere? otherwise default to inf? 
+ if self.kwargs["mem_per_task"] > resources["memory"]: msg = "memory" if msg: msg = f"Apps will request more {msg} than available per Parsl block" - raise ValueError(msg) + raise ConfigurationError(msg) # how long can individual tasks run (in seconds) if max_runtime is None: @@ -256,14 +253,14 @@ def __init__( max_runtime = str_to_timedelta(max_runtime).seconds if max_runtime != float("inf") and max_runtime >= self.lifetime: msg = "Allowed task runtime exceeds provider walltime. Tasks might get killed by the scheduler." - warnings.warn(msg) + logger.warning(msg) self.max_runtime = max_runtime if ( self.executor_type == "workqueue" and self.kwargs["min_runtime"] >= self.max_runtime ): msg = "Minimum task runtime exceeds maximum runtime. WQ might not not start tasks." - warnings.warn(msg) + logger.warning(msg) # set default WQ resource specs self.spec: dict | None = None @@ -338,10 +335,10 @@ def wrap_in_timeout(self, command: str) -> str: # send SIGTERM after max_runtime, follow with SIGKILL 30s later return f"timeout -k 30s {self.max_runtime}s {command}" - def wrap_in_srun(self, command: str) -> str: - # TODO: stub -- this does not work - if self.provider is None: - return command # noop + # def wrap_in_srun(self, command: str) -> str: + # # TODO: stub -- this does not work + # if self.provider is None: + # return command # noop return f"srun -t 1 -c $CORES {command}" @@ -397,8 +394,8 @@ def wq_resources(self, *args, **kwargs) -> dict: @classmethod def from_config( cls, - executor: str, # TODO: no default value? 
- container: Optional[ContainerSpec], + executor: str = "workqueue", + container: Optional[ContainerSpec] = None, **kwargs, ): if executor == "threadpool": @@ -423,7 +420,9 @@ def from_config( min_runtime = kwargs.get("min_runtime", "00:00:00") executor_kwargs["min_runtime"] = str_to_timedelta(min_runtime).seconds else: - raise ValueError("Key 'executor' must be 'threadpool' or 'workqueue'") + raise ConfigurationError( + "Key 'executor' must be 'threadpool' or 'workqueue'" + ) # search for Parsl ExecutionProvider block, defaulting to "local" if "slurm" in kwargs: @@ -457,8 +456,7 @@ def __init__( super().__init__(**kwargs) if self.use_gpu and self.kwargs["gpus_per_task"] > 1: - # TODO: 'ConfigurationError' maybe? - raise ValueError("No Hamiltonian can do multi-GPU evaluation") + raise ConfigurationError("No Hamiltonian can do multi-GPU evaluation") # i-Pi will kill client connections after no response for timeout seconds self.timeout = timeout @@ -468,7 +466,7 @@ def __init__( if max_resource_multiplier is None: max_resource_multiplier = self.task_slots elif max_resource_multiplier > self.task_slots: - warnings.warn( + logger.warning( "Provided 'max_resource_multiplier' exceeds available task slots " f"({max_resource_multiplier} -> {self.task_slots}). " f"Limiting 'max_resource_multiplier'." @@ -490,12 +488,12 @@ def get_driver_resources(self, n_walkers: int, n_drivers: int) -> list[dict]: if n_drivers > m and not self.allow_oversubscription: # the combination of drivers does not fit on available resources - raise ValueError( + raise ConfigurationError( f"Simulation with {n_drivers} independent drivers not possible. " f"Either increase 'max_resource_multiplier' or enable resource oversubscription." ) if n_clients > m and self.allow_oversubscription: - warnings.warn( + logger.warning( f"Simulation wants to employ {n_clients} clients, " f"but can only use {m}x the per-client budget. " f"Oversubscribing CPU/GPU resources." 
@@ -530,7 +528,7 @@ def __init__(self, **kwargs) -> None: super().__init__(**kwargs) if not self.use_gpu: - warnings.warn( + logger.warning( "ModelTraining is configured for CPU operation. Is this what you want?" ) @@ -566,7 +564,7 @@ def __init__( self.memory_limit = memory_limit # in GB if self.use_gpu: - warnings.warn("Reference calculations do not support GPU computation yet.") + logger.warning("Reference calculations do not support GPU computation yet.") def command(self): command = self.reference.launch_command() @@ -661,6 +659,7 @@ def new_file(self, prefix: str, suffix: str) -> File: def from_config( cls, parsl_log_level: str, + psiflow_log_level: str, default_threads: int, **kwargs, ) -> "ExecutionContext": @@ -674,7 +673,7 @@ def from_config( log_file = str(path / PARSL_LOGFILE) log_level = getattr(logging, parsl_log_level) parsl.set_file_logger(filename=log_file, name="parsl", level=log_level) - setup_logging(file=path / PSIFLOW_LOGFILE) # TODO + setup_logging(file=path / PSIFLOW_LOGFILE, level=psiflow_log_level) # default container for ModelEvaluation and ModelTraining base_container = None @@ -716,7 +715,7 @@ def from_config( initialize_logging=False, **kwargs, ) - return ExecutionContext(config, definitions, path / "context_dir", **kwargs) + return ExecutionContext(config, definitions, path / CONTEXT_DIR, **kwargs) class ExecutionContextLoader: @@ -796,67 +795,6 @@ def __call__(self, command: str, tasks_per_node: int, nodes_per_block: int) -> s return x -# TODO: move everything below to appropriate files - -# TODO: attempt at managing priority through global state -WQ_RESOURCES_REGISTRY = [] - - -def register_definition(definition: ExecutionDefinition) -> None: - """""" - if (spec := definition.spec) is None: - return # threadpool does not have priority - - WQ_RESOURCES_REGISTRY.append((definition.name, spec)) - spec["priority"] = SetWQPriority.default - - -class SetWQPriority: - """Manage the WQ priority tag as context manager""" - - # TODO: this 
probably does not work in a nested way - # TODO: log to parsl.log? - default = 0 - - def __init__(self, value: int, verbose: bool = False) -> None: - self.value = value - self.verbose = verbose - - def __enter__(self): - if self.verbose: - print(f"SetWQPriority setting priority:\t{self.value}") - for n, spec in WQ_RESOURCES_REGISTRY: - spec["priority"] = self.value - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.verbose: - print(f"SetWQPriority unsetting {self.value}") - for n, spec in WQ_RESOURCES_REGISTRY: - spec["priority"] = SetWQPriority.default - - -# This is the default psiflow config which is always passed into the ExecutionContext -# TODO: find a place for this -DEFAULT_CONFIG = """ -parsl_log_level: WARNING -usage_tracking: 3 -default_threads: 4 -tmpdir_root: /tmp -keep_tmpdirs: false -garbage_collect: true -retries: 0 - -ModelEvaluation: - executor: threadpool - max_threads: 2 - -ModelTraining: - executor: threadpool - max_threads: 2 -""" - - def patch_parsl_dirtree() -> None: """By default, Parsl will put Executor logs etc. under numbered directories. We do not need this level of nesting, as psiflow_internal is refreshed every run""" @@ -866,61 +804,6 @@ def patch_parsl_dirtree() -> None: parsl.dataflow.dflow.make_rundir = lambda x: x -# TODO: arguments that need documenting: retries, strategy?, timeout, garbage_collect (Config) - - -def create_bash_template(tmpdir_root: str, keep_tmpdirs: bool) -> str: - """Create general wrapper for all bash apps. The exitcode ensures that every app completes successfully.""" - template = f""" - # Create and move into new tmpdir for app execution - tmpdir=$(mktemp -d -p {tmpdir_root} "psiflow-tmp.XXXXXXXXXX") - cd $tmpdir; echo "tmpdir: $PWD" - {{env}} - printenv - - # Actual app definition goes here - {{commands}} - - # Cleanup - {'cd ../.. 
&& rm -r $tmpdir' if not keep_tmpdirs else ''} - exit 0 - """ - return textwrap.dedent(template) - - -def format_env_vars(env_vars: dict) -> str: - if len(env_vars) == 0: - return "" - return "export" + " ".join([f"{k}={v}" for k, v in env_vars.items()]) - - -def str_to_timedelta(s: str) -> timedelta: - t = datetime.strptime(s, "%H:%M:%S") - return timedelta(hours=t.hour, minutes=t.minute, seconds=t.second) - - -def log_dfk_tasks(verbose: bool = False): - """Get an overview of all tasks stored in the parsl DFK. For debugging purposes.""" - dfk = parsl.dfk() - parsl.wait_for_current_tasks() - log = ["- Parsl task overview -"] - if not verbose: - log += [f"{i}\t{d['func_name']}" for i, d in dfk.tasks.items()] - log.append("- Parsl task overview -") - print(*log, sep="\n") - return - - for i, d in dfk.tasks.items(): - args = [(_.split("/")[-1] if isinstance(_, str) else _) for _ in d["args"]] - if "inputs" in (kwargs := d["kwargs"]): - kwargs["inputs"] = [f.filename for f in kwargs["inputs"]] - if "outputs" in kwargs: - kwargs["outputs"] = [f.filename for f in kwargs["outputs"]] - log.append(f"\n{i}\t{d['func_name']:<30}\n{args}\n{kwargs}") - log.append("- Parsl task overview -") - print(*log, sep="\n") - - # TODO: after 3.12, this is no longer needed # https://docs.python.org/3/library/typing.html T = TypeVar("T") diff --git a/psiflow/models/model.py b/psiflow/models/model.py index 6e11b04..73c9cb1 100644 --- a/psiflow/models/model.py +++ b/psiflow/models/model.py @@ -1,5 +1,6 @@ from __future__ import annotations # necessary for type-guarding class methods +import logging from dataclasses import asdict from pathlib import Path from typing import Optional, Union @@ -11,10 +12,11 @@ import psiflow from psiflow.data import Dataset -from psiflow.utils.apps import copy_data_future, log_message, setup_logger +from psiflow.utils.apps import copy_data_future, log_message from psiflow.utils.io import save_yaml -logger = setup_logger(__name__) + +logger = 
logging.getLogger(__name__) @typeguard.typechecked diff --git a/psiflow/sampling/ase.py b/psiflow/sampling/ase.py index e039791..4775c1a 100644 --- a/psiflow/sampling/ase.py +++ b/psiflow/sampling/ase.py @@ -1,3 +1,4 @@ +import logging from typing import Optional, Union import parsl @@ -9,7 +10,6 @@ from psiflow.data.utils import write_frames from psiflow.geometry import Geometry from psiflow.hamiltonians import Hamiltonian -from psiflow.utils.apps import setup_logger from psiflow.utils.io import _dump_json from psiflow.utils.parse import get_task_name_id from psiflow.execution import format_env_vars @@ -17,7 +17,7 @@ from ._ase import ALLOWED_MODES, __file__ as file_ase DEFAULT_EXECUTABLE = "script.py" -logger = setup_logger(__name__) # logging per module +logger = logging.getLogger(__name__) class OptimisationFailedError(Exception): diff --git a/psiflow/sampling/output.py b/psiflow/sampling/output.py index 318a86c..7d74aec 100644 --- a/psiflow/sampling/output.py +++ b/psiflow/sampling/output.py @@ -1,5 +1,4 @@ -import copy -import re +import logging from enum import Enum from pathlib import Path from dataclasses import dataclass, field, InitVar @@ -13,15 +12,14 @@ import psiflow from psiflow.data import Dataset -from psiflow.geometry import Geometry, NullState +from psiflow.geometry import Geometry from psiflow.hamiltonians import Hamiltonian, MixtureHamiltonian, Zero from psiflow.sampling.walker import Walker from psiflow.utils.io import save_npz -from psiflow.utils.apps import setup_logger from psiflow.utils.parse import get_task_name_id -logger = setup_logger(__name__) # logging per module +logger = logging.getLogger(__name__) DEFAULT_OBSERVABLES = [ diff --git a/psiflow/utils/_plumed.py b/psiflow/utils/_plumed.py index 89b6789..611bdc8 100644 --- a/psiflow/utils/_plumed.py +++ b/psiflow/utils/_plumed.py @@ -1,10 +1,7 @@ import logging import os -import typeguard - -@typeguard.typechecked def try_manual_plumed_linking() -> str: if "PLUMED_KERNEL" not in 
os.environ.keys(): # try linking manually @@ -23,7 +20,6 @@ def try_manual_plumed_linking() -> str: return os.environ["PLUMED_KERNEL"] -@typeguard.typechecked def remove_comments_printflush(plumed_input: str) -> str: new_input = [] for line in list(plumed_input.split("\n")): @@ -38,7 +34,6 @@ def remove_comments_printflush(plumed_input: str) -> str: return "\n".join(new_input) -@typeguard.typechecked def set_path_in_plumed(plumed_input: str, keyword: str, path_to_set: str) -> str: lines = plumed_input.split("\n") for i, line in enumerate(lines): diff --git a/psiflow/utils/apps.py b/psiflow/utils/apps.py index 0f52bdd..894619f 100644 --- a/psiflow/utils/apps.py +++ b/psiflow/utils/apps.py @@ -1,23 +1,20 @@ -from __future__ import annotations # necessary for type-guarding class methods - -import logging -import sys +import shutil +import textwrap from typing import Any, Union +from pathlib import Path import numpy as np -import typeguard -from parsl.app.app import python_app +from parsl import python_app from parsl.data_provider.files import File -@typeguard.typechecked def get_attribute(obj: Any, *attribute_names: str) -> Any: + # TODO: not an app for name in attribute_names: obj = getattr(obj, name) return obj -@typeguard.typechecked def _boolean_or(*args: Union[bool, np.bool_]) -> bool: return any(args) @@ -32,27 +29,6 @@ def _multiply(a, b): multiply = python_app(_multiply, executors=["default_threads"]) -@typeguard.typechecked -def setup_logger(module_name: str, level=logging.INFO) -> logging.Logger: - # Create logger instance for the module - module_logger = logging.getLogger(module_name) - - # Set the desired format string - formatter = logging.Formatter("%(name)s - %(message)s") - - # Create handler to send logs to stdout - stdout_handler = logging.StreamHandler(sys.stdout) - stdout_handler.setFormatter(formatter) - - # Add handler to the logger instance - module_logger.addHandler(stdout_handler) - - # Set the logging level for the logger - 
module_logger.setLevel(level) - - return module_logger - - def _compute_sum(a, b): return np.add(a, b) @@ -60,37 +36,25 @@ def _compute_sum(a, b): compute_sum = python_app(_compute_sum, executors=["default_threads"]) -@typeguard.typechecked -def _combine_futures(inputs: list[Any]) -> list[Any]: - return list(inputs) - - -combine_futures = python_app(_combine_futures, executors=["default_threads"]) - - -@typeguard.typechecked def _copy_data_future( pass_on_exist: bool = False, inputs: list[File] = [], outputs: list[File] = [], ) -> None: - import shutil - from pathlib import Path - assert len(inputs) == 1 assert len(outputs) == 1 if Path(outputs[0]).is_file() and pass_on_exist: - return None - if Path(inputs[0]).is_file(): + pass + elif Path(inputs[0]).is_file(): shutil.copyfile(inputs[0], outputs[0]) else: # no need to copy empty file pass + return copy_data_future = python_app(_copy_data_future, executors=["default_threads"]) -@typeguard.typechecked def _copy_app_future(future: Any, inputs: list = [], outputs: list = []) -> Any: # inputs/outputs to enforce additional dependencies from copy import deepcopy @@ -101,7 +65,6 @@ def _copy_app_future(future: Any, inputs: list = [], outputs: list = []) -> Any: copy_app_future = python_app(_copy_app_future, executors=["default_threads"]) -@typeguard.typechecked def _log_message(logger, message, *futures): if len(futures) > 0: logger.info(message.format(*futures)) @@ -112,23 +75,11 @@ def _log_message(logger, message, *futures): log_message = python_app(_log_message, executors=["default_threads"]) -def _pack(*args): - return args # TODO: _combine_futures? 
- - -pack = python_app(_pack, executors=["default_threads"]) - - -@typeguard.typechecked -def _unpack_i(result: Union[np.ndarray, list, tuple], i: int) -> Any: - assert i <= len(result) - return result[i] - - -unpack_i = python_app(_unpack_i, executors=["default_threads"]) +@python_app(executors=["default_threads"]) +def pack(*args: Any) -> tuple[Any]: + return args -@typeguard.typechecked def _concatenate(*arrays: np.ndarray) -> np.ndarray: return np.concatenate(arrays) @@ -136,9 +87,31 @@ def _concatenate(*arrays: np.ndarray) -> np.ndarray: concatenate = python_app(_concatenate, executors=["default_threads"]) -@typeguard.typechecked def _isnan(a: Union[float, np.ndarray]) -> bool: return bool(np.any(np.isnan(a))) isnan = python_app(_isnan, executors=["default_threads"]) + + +def create_bash_template(tmpdir_root: str, keep_tmpdirs: bool) -> str: + """Create general wrapper for all bash apps. The exitcode ensures that every app completes successfully.""" + template = f""" + # Create and move into new tmpdir for app execution + tmpdir=$(mktemp -d -p {tmpdir_root} "psiflow-tmp.XXXXXXXXXX") + cd $tmpdir; echo "tmpdir: $PWD" + {{env}} + printenv + + # Actual app definition goes here + {{commands}} + + # Cleanup + {'cd ../.. 
&& rm -r $tmpdir' if not keep_tmpdirs else ''} + exit 0 + """ + return textwrap.dedent(template) + + +combine_futures = None +unpack_i = None diff --git a/psiflow/utils/config.py b/psiflow/utils/config.py new file mode 100644 index 0000000..14be6b1 --- /dev/null +++ b/psiflow/utils/config.py @@ -0,0 +1,23 @@ +PSIFLOW_INTERNAL = "psiflow_internal" +PARSL_LOGFILE = "parsl.log" +PSIFLOW_LOGFILE = "psiflow.log" +CONTEXT_DIR = "context_dir" + + +DEFAULT_CONFIG = """ +parsl_log_level: WARNING +psiflow_log_level: WARNING +usage_tracking: 3 +default_threads: 4 +tmpdir_root: /tmp +keep_tmpdirs: false + +ModelEvaluation: + executor: threadpool + max_threads: 2 + +ModelTraining: + executor: threadpool + max_threads: 2 +""" + diff --git a/psiflow/utils/io.py b/psiflow/utils/io.py index c5e9385..2123429 100644 --- a/psiflow/utils/io.py +++ b/psiflow/utils/io.py @@ -3,12 +3,10 @@ from typing import Any import numpy as np -import typeguard from parsl.app.app import python_app from parsl.data_provider.files import File -@typeguard.typechecked def _save_yaml( input_dict: dict, outputs: list[File] = [], @@ -39,7 +37,6 @@ def _make_dict_safe(arg): save_yaml = python_app(_save_yaml, executors=["default_threads"]) -@typeguard.typechecked def _save_xml( element: ET.Element, outputs: list = [], @@ -52,7 +49,6 @@ def _save_xml( save_xml = python_app(_save_xml, executors=["default_threads"]) -@typeguard.typechecked def _load_numpy(inputs: list[File] = [], **kwargs) -> np.ndarray: return np.loadtxt(inputs[0], **kwargs) @@ -60,8 +56,7 @@ def _load_numpy(inputs: list[File] = [], **kwargs) -> np.ndarray: load_numpy = python_app(_load_numpy, executors=["default_threads"]) -@typeguard.typechecked -def _read_yaml(inputs: list[File] = [], outputs: list[File] = []) -> dict: +def _read_yaml(inputs: list[File] = []) -> dict: import yaml with open(inputs[0], "r") as f: @@ -72,7 +67,6 @@ def _read_yaml(inputs: list[File] = [], outputs: list[File] = []) -> dict: read_yaml = python_app(_read_yaml, 
executors=["default_threads"]) -@typeguard.typechecked def _save_txt(data: str, outputs: list[File] = []) -> None: with open(outputs[0], "w") as f: f.write(data) @@ -81,7 +75,6 @@ def _save_txt(data: str, outputs: list[File] = []) -> None: save_txt = python_app(_save_txt, executors=["default_threads"]) -@typeguard.typechecked def _load_metrics(inputs: list = []) -> np.recarray: # TODO: stop using recarrays return np.load(inputs[0], allow_pickle=True) @@ -90,7 +83,6 @@ def _load_metrics(inputs: list = []) -> np.recarray: load_metrics = python_app(_load_metrics, executors=["default_threads"]) -@typeguard.typechecked def _save_metrics(data: np.recarray, outputs: list = []) -> None: # TODO: stop using recarrays with open(outputs[0], "wb") as f: @@ -100,7 +92,6 @@ def _save_metrics(data: np.recarray, outputs: list = []) -> None: save_metrics = python_app(_save_metrics, executors=["default_threads"]) -@typeguard.typechecked def _dump_json( inputs: list = [], outputs: list = [], diff --git a/psiflow/utils/parse.py b/psiflow/utils/parse.py index 99fbcd7..9f265d1 100644 --- a/psiflow/utils/parse.py +++ b/psiflow/utils/parse.py @@ -7,9 +7,7 @@ class LineNotFoundError(Exception): - """Call to find_line failed""" - - pass + pass # call to find_line failed def find_line( @@ -24,7 +22,8 @@ def find_line( idx_slice = slice(idx_start, idx_start + max_lines) else: idx_start = idx_start or len(lines) - 1 - idx_slice = slice(idx_start, idx_start - max_lines, -1) + idx_stop = max(idx_start - max_lines, 0) + idx_slice = slice(idx_start, idx_stop, -1) for i, l in enumerate(lines[idx_slice]): if l.strip().startswith(line): if not reverse: @@ -41,8 +40,12 @@ def lines_to_array( return np.array([line.split()[start:stop] for line in lines], dtype=dtype) -def string_to_timedelta(timedelta: str) -> datetime.timedelta: +def str_to_timedelta(s: str) -> datetime.timedelta: """""" + # TODO: this will probably not work in general + time = datetime.datetime.strptime(s, "%H:%M:%S") + return 
datetime.timedelta(hours=time.hour, minutes=time.minute, seconds=time.second) + allowed_units = "weeks", "days", "hours", "minutes", "seconds" time_list = timedelta.split() values, units = time_list[:-1:2], time_list[1::2] @@ -50,9 +53,10 @@ def string_to_timedelta(timedelta: str) -> datetime.timedelta: return datetime.timedelta(**kwargs) + def get_task_logs(task_id: int) -> tuple[Path, Path]: """""" - path = Path.cwd().resolve() / PSIFLOW_INTERNAL / "000/task_logs" # TODO + path = Path.cwd().resolve() / PSIFLOW_INTERNAL / "task_logs" stdout = next(path.rglob(f"task_{task_id}_*.stdout")) stderr = next(path.rglob(f"task_{task_id}_*.stderr")) return stdout, stderr @@ -61,3 +65,11 @@ def get_task_logs(task_id: int) -> tuple[Path, Path]: def get_task_name_id(logfile: str) -> tuple[str, str]: _, task_id, task_name = Path(logfile).stem.split("_", maxsplit=2) return task_name, task_id + + +def format_env_vars(env_vars: dict) -> str: + if len(env_vars) == 0: + return "" + return "export" + " ".join([f"{k}={v}" for k, v in env_vars.items()]) + + diff --git a/psiflow/utils/wq.py b/psiflow/utils/wq.py new file mode 100644 index 0000000..4ecc01d --- /dev/null +++ b/psiflow/utils/wq.py @@ -0,0 +1,40 @@ +# TODO: this probably does not work in a nested way + +import logging + + +logger = logging.getLogger(__name__) # logging per module + + +WQ_RESOURCES_REGISTRY = [] + + +def register_definition(definition: 'ExecutionDefinition') -> None: + """""" + if (spec := definition.spec) is None: + return # threadpool does not have priority + + WQ_RESOURCES_REGISTRY.append((definition.name, spec)) + spec["priority"] = SetWQPriority.default + + +class SetWQPriority: + """Manage the WQ priority tag as context manager""" + default = 0 + + def __init__(self, value: int, verbose: bool = False) -> None: + self.value = value + self.verbose = verbose + + def __enter__(self): + if self.verbose: + logger.info(f"SetWQPriority setting priority:\t{self.value}") + for n, spec in WQ_RESOURCES_REGISTRY: + 
spec["priority"] = self.value + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.verbose: + logger.info(f"SetWQPriority unsetting {self.value}") + for n, spec in WQ_RESOURCES_REGISTRY: + spec["priority"] = SetWQPriority.default diff --git a/tests/test_reference.py b/tests/test_reference.py index 8fa5b09..a1be636 100644 --- a/tests/test_reference.py +++ b/tests/test_reference.py @@ -146,11 +146,10 @@ def test_cp2k_parse_output(): ENERGY| Total FORCE_EVAL ( QS ) energy [a.u.]: -14.202993407031412 - ATOMIC FORCES in [a.u.] - - # Atom Kind Element X Y Z - 1 1 O 0.00000000 0.00000000 0.00000000 - SUM OF ATOMIC FORCES 0.00000000 0.00000000 0.00000000 0.00000000 + FORCES| Atomic forces [hartree/bohr] + FORCES| Atom x y z |f| + FORCES| 1 0 0 0 0 + FORCES| Sum 0.00000000 0.00000000 0.00000000 0.00000000 STRESS| Analytical stress tensor [GPa] STRESS| x y z @@ -169,7 +168,7 @@ def test_cp2k_parse_output(): ### SKIPPED A BIT ### - ------------------------------------------------------------------------------- + ------------------------------------------------------------------------------- - - - T I M I N G - - - @@ -223,7 +222,7 @@ def test_cp2k_success(simple_cp2k_input, geom_h2_p): if "Number of threads for this process" in line: nthreads = int(line.split()[-1]) definition = psiflow.context().definitions["CP2K"] - ncores = definition.cores_per_worker + ncores = definition.cores_per_task assert ncores == nprocesses assert 1 == nthreads @@ -306,7 +305,6 @@ def test_cp2k_failure(geom_h2_p): def test_cp2k_memory(simple_cp2k_input): - # TODO: test_cp2k_memory == test_cp2k_timeout until memory constraints work reference = CP2K(simple_cp2k_input) geometry = Geometry.from_data( numbers=np.ones(4000), diff --git a/tests/test_sampling.py b/tests/test_sampling.py index 7d955c6..a502a6a 100644 --- a/tests/test_sampling.py +++ b/tests/test_sampling.py @@ -317,7 +317,7 @@ def test_output_status(dataset): # walltime definition = 
psiflow.context().definitions["ModelEvaluation"] - definition.max_simulation_time = 5 / 60 # 5 seconds + definition.max_runtime = 5 # seconds outputs = sample([walker], steps=10000) assert outputs[0].status.result() == Status.TIMEOUT assert outputs[0].time.result() > 0 From f9f7d3371b56f5f260f0a1bdcd626788bbdc1440 Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Tue, 10 Mar 2026 21:26:33 +0100 Subject: [PATCH 12/15] small cleanup action --- psiflow/data/dataset.py | 10 +++++----- psiflow/data/utils.py | 7 +++---- psiflow/free_energy/phonons.py | 2 +- psiflow/learning.py | 6 ++++-- psiflow/metrics.py | 6 ++++-- psiflow/reference/orca_.py | 4 ++-- psiflow/reference/reference.py | 5 +++-- psiflow/sampling/ase.py | 3 +-- psiflow/sampling/optimize.py | 2 +- psiflow/sampling/sampling.py | 2 +- psiflow/utils/apps.py | 4 +--- tests/test_learning.py | 2 +- 12 files changed, 27 insertions(+), 26 deletions(-) diff --git a/psiflow/data/dataset.py b/psiflow/data/dataset.py index e332167..660928c 100644 --- a/psiflow/data/dataset.py +++ b/psiflow/data/dataset.py @@ -13,7 +13,7 @@ import psiflow from psiflow.geometry import QUANTITIES, Geometry -from psiflow.utils.apps import combine_futures, copy_data_future, unpack_i +from psiflow.utils.apps import copy_data_future, pack from .utils import ( align_axes, @@ -118,7 +118,7 @@ def __getitem__( inputs=[self.extxyz], outputs=[], # will return Geometry as Future ) - return unpack_i(future, 0) + return future[0] else: # slice, list, AppFuture extxyz = read_frames( index, @@ -266,9 +266,9 @@ def get( inputs=[self.extxyz], ) if len(quantities) == 1: - return unpack_i(result, 0) + return result[0] else: - return tuple([unpack_i(result, i) for i in range(len(quantities))]) + return tuple([result[i] for i in range(len(quantities))]) def evaluate( self, @@ -301,7 +301,7 @@ def evaluate( outputs = [outputs] future = insert_quantities( quantities=tuple(computable.outputs), - arrays=combine_futures(inputs=list(outputs)), + 
arrays=pack(*outputs), inputs=[self.extxyz], outputs=[psiflow.context().new_file("data_", ".xyz")], ) diff --git a/psiflow/data/utils.py b/psiflow/data/utils.py index 9078de1..0f54936 100644 --- a/psiflow/data/utils.py +++ b/psiflow/data/utils.py @@ -1,6 +1,6 @@ import re import shutil -from typing import Optional, Union +from typing import Optional, Union, Sequence import numpy as np import typeguard @@ -9,7 +9,6 @@ from parsl.dataflow.futures import AppFuture from psiflow.geometry import Geometry, NullState, _assign_identifier, create_outputs -from psiflow.utils.apps import unpack_i @typeguard.typechecked @@ -206,7 +205,7 @@ def _extract_quantities( @typeguard.typechecked def _insert_quantities( quantities: tuple[str, ...], - arrays: list[np.ndarray, ...], + arrays: Sequence[np.ndarray], data: Optional[list[Geometry]] = None, inputs: list = [], outputs: list = [], @@ -761,7 +760,7 @@ def get_train_valid_indices( tuple[AppFuture, AppFuture]: Futures for training and validation indices. 
""" future = train_valid_indices(effective_nstates, train_valid_split, shuffle) - return unpack_i(future, 0), unpack_i(future, 1) + return future[0], future[1] @typeguard.typechecked diff --git a/psiflow/free_energy/phonons.py b/psiflow/free_energy/phonons.py index 2f449ef..8728fd5 100644 --- a/psiflow/free_energy/phonons.py +++ b/psiflow/free_energy/phonons.py @@ -20,7 +20,7 @@ from psiflow.sampling.optimize import setup_forces from psiflow.utils.apps import multiply from psiflow.utils.io import load_numpy, save_xml -from psiflow.execution import format_env_vars +from psiflow.utils.parse import format_env_vars def _compute_frequencies(hessian: np.ndarray, geometry: Geometry) -> np.ndarray: diff --git a/psiflow/learning.py b/psiflow/learning.py index 3191a14..eab93d1 100644 --- a/psiflow/learning.py +++ b/psiflow/learning.py @@ -1,6 +1,7 @@ from __future__ import annotations # necessary for type-guarding class methods import shutil +import logging from pathlib import Path from typing import Optional, Union @@ -17,9 +18,10 @@ from psiflow.models import Model from psiflow.reference import Reference from psiflow.sampling import SimulationOutput, Walker, sample -from psiflow.utils.apps import boolean_or, isnan, setup_logger, unpack_i +from psiflow.utils.apps import boolean_or, isnan -logger = setup_logger(__name__) + +logger = logging.getLogger(__name__) # logging per module @typeguard.typechecked diff --git a/psiflow/metrics.py b/psiflow/metrics.py index 2f73386..0234642 100644 --- a/psiflow/metrics.py +++ b/psiflow/metrics.py @@ -1,6 +1,7 @@ from __future__ import annotations # necessary for type-guarding class methods import os +import logging from pathlib import Path from typing import Optional, Union @@ -15,9 +16,10 @@ from psiflow.hamiltonians import Hamiltonian from psiflow.models import Model from psiflow.sampling import SimulationOutput -from psiflow.utils.apps import combine_futures, log_message, setup_logger +from psiflow.utils.apps import log_message +# 
from psiflow.utils.apps import combine_futures, log_message, setup_logger -logger = setup_logger(__name__) +logger = logging.getLogger(__name__) # logging per module @typeguard.typechecked diff --git a/psiflow/reference/orca_.py b/psiflow/reference/orca_.py index 509aa25..cc2185b 100644 --- a/psiflow/reference/orca_.py +++ b/psiflow/reference/orca_.py @@ -12,7 +12,7 @@ import psiflow from psiflow.geometry import Geometry from psiflow.reference.reference import Reference, Status, get_spin_multiplicities -from psiflow.utils.parse import find_line, lines_to_array, string_to_timedelta +from psiflow.utils.parse import find_line, lines_to_array, str_to_timedelta KEY_GHOST = "ghost" @@ -94,7 +94,7 @@ def parse_output(stdout: str, properties: tuple[str, ...]) -> dict: if status == Status.SUCCESS: # total runtime idx = find_line(lines, "TOTAL RUN TIME", reverse=True, max_lines=5) - data["runtime"] = string_to_timedelta(lines[idx][16:]) + data["runtime"] = str_to_timedelta(lines[idx][16:]) # read coordinates idx_start = idx = find_line(lines, "CARTESIAN COORDINATES (ANGSTROEM)") + 2 diff --git a/psiflow/reference/reference.py b/psiflow/reference/reference.py index dc29ec2..d049d9a 100644 --- a/psiflow/reference/reference.py +++ b/psiflow/reference/reference.py @@ -1,6 +1,7 @@ from __future__ import annotations # necessary for type-guarding class methods import warnings +import logging from typing import ClassVar, Optional, Union, Callable, Sequence from pathlib import Path from functools import partial @@ -16,11 +17,11 @@ from psiflow.data import Computable, Dataset from psiflow.data.utils import extract_quantities from psiflow.geometry import Geometry, NullState -from psiflow.utils.apps import copy_app_future, setup_logger +from psiflow.utils.apps import copy_app_future from psiflow.utils.parse import LineNotFoundError, get_task_name_id -logger = setup_logger(__name__) # logging per module +logger = logging.getLogger(__name__) # logging per module class Status(Enum): diff 
--git a/psiflow/sampling/ase.py b/psiflow/sampling/ase.py index 4775c1a..69dde69 100644 --- a/psiflow/sampling/ase.py +++ b/psiflow/sampling/ase.py @@ -11,8 +11,7 @@ from psiflow.geometry import Geometry from psiflow.hamiltonians import Hamiltonian from psiflow.utils.io import _dump_json -from psiflow.utils.parse import get_task_name_id -from psiflow.execution import format_env_vars +from psiflow.utils.parse import get_task_name_id, format_env_vars from ._ase import ALLOWED_MODES, __file__ as file_ase diff --git a/psiflow/sampling/optimize.py b/psiflow/sampling/optimize.py index cb531a4..3d09880 100644 --- a/psiflow/sampling/optimize.py +++ b/psiflow/sampling/optimize.py @@ -20,7 +20,7 @@ ) from psiflow.sampling.output import HamiltonianComponent from psiflow.utils.io import save_xml -from psiflow.execution import format_env_vars +from psiflow.utils.parse import format_env_vars warnings.warn( diff --git a/psiflow/sampling/sampling.py b/psiflow/sampling/sampling.py index e37e1b5..dcb8fa8 100644 --- a/psiflow/sampling/sampling.py +++ b/psiflow/sampling/sampling.py @@ -21,7 +21,7 @@ potential_component_name, HamiltonianComponent, ) -from psiflow.execution import format_env_vars +from psiflow.utils.parse import format_env_vars from psiflow.sampling.utils import create_xml_list from psiflow.sampling.walker import Coupling, Walker, partition, Ensemble from psiflow.utils.io import _save_xml diff --git a/psiflow/utils/apps.py b/psiflow/utils/apps.py index 894619f..773915a 100644 --- a/psiflow/utils/apps.py +++ b/psiflow/utils/apps.py @@ -77,6 +77,7 @@ def _log_message(logger, message, *futures): @python_app(executors=["default_threads"]) def pack(*args: Any) -> tuple[Any]: + """Combine passed futures into a single future.""" return args @@ -112,6 +113,3 @@ def create_bash_template(tmpdir_root: str, keep_tmpdirs: bool) -> str: """ return textwrap.dedent(template) - -combine_futures = None -unpack_i = None diff --git a/tests/test_learning.py b/tests/test_learning.py index 
e450a97..430a6ca 100644 --- a/tests/test_learning.py +++ b/tests/test_learning.py @@ -9,7 +9,7 @@ from psiflow.metrics import Metrics, _create_table, parse_walker_log, reconstruct_dtypes from psiflow.reference import ReferenceDummy from psiflow.sampling import SimulationOutput, Walker -from psiflow.utils.apps import combine_futures +# from psiflow.utils.apps import combine_futures # use pack instead from psiflow.utils.io import _load_metrics, _save_metrics, load_metrics, save_metrics From bcfda23474531125370eb683742f121f8cedc9af Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Wed, 11 Mar 2026 22:35:49 +0100 Subject: [PATCH 13/15] final tweaks --- configs/local_test.yaml | 2 + psiflow/execution.py | 108 +++++++++++++++++++++------------------ psiflow/utils/logging.py | 2 +- pyproject.toml | 5 +- tests/conftest.py | 2 - tests/test_execution.py | 59 +++++++++++++++++++++ 6 files changed, 122 insertions(+), 56 deletions(-) create mode 100644 tests/test_execution.py diff --git a/configs/local_test.yaml b/configs/local_test.yaml index b44c2af..1342c39 100644 --- a/configs/local_test.yaml +++ b/configs/local_test.yaml @@ -1,4 +1,6 @@ --- +psiflow_log_level: INFO + ModelEvaluation: executor: threadpool max_threads: 4 diff --git a/psiflow/execution.py b/psiflow/execution.py index 34b643d..4d4aaef 100644 --- a/psiflow/execution.py +++ b/psiflow/execution.py @@ -44,6 +44,21 @@ class ConfigurationError(ValueError): pass # some global psiflow configuration option does not make sense +def ensure( + *conditions: bool, + msg: str = "Whoopsie", + msgs: Sequence[str] = (), + template: str = "{}", +) -> None: + """Small helper function to replace 'assert' statements""" + if all(conditions): + return + if len(msgs) == 0: + raise ConfigurationError(msg) + msg = msgs[conditions.index(False)] + raise ConfigurationError(template.format(msg)) + + @dataclass class ContainerSpec: """Controls container configuration""" @@ -54,19 +69,17 @@ class ContainerSpec: gpu_flavour: str | None = None 
def __post_init__(self): - assert self.engine in ("apptainer", "singularity") - assert len(self.uri) > 0 - assert self.gpu_flavour in ("cuda", "rocm", None) + ensure( + self.engine in ("apptainer", "singularity"), + len(self.uri) > 0, + self.gpu_flavour in ("cuda", "rocm", None), + msg="Invalid container configuration", + ) def launch_command(self) -> str: pwd = Path.cwd().resolve() # access to data / internal dir - args = [self.engine, "run", self.addopts, f"--bind {pwd}"] - if self.gpu_flavour == "cuda": - args.append("--nv") - elif self.gpu_flavour == "rocm": - args.append("--rocm") - args.append(self.uri) - return " ".join(args) + gpu = {"cuda": "--nv", "rocm": "--rocm"}.get(self.gpu_flavour, "") + return f"{self.engine} run {self.addopts} {gpu} --bind {pwd} {self.uri}" class ReferenceSpec(Protocol): @@ -140,15 +153,15 @@ def launch_command(self): def make_slurm_provider(kwargs: dict) -> tuple[SlurmProvider, dict]: defaults = {"init_blocks": 0, "exclusive": False} - required = ("cores_per_node", "walltime", "gpus_per_node") + required = ("cores_per_node", "walltime") kwargs = defaults | kwargs - assert all(key in kwargs for key in required) + ensure(all(key in kwargs for key in required)) provider = SlurmProvider(**kwargs) # does not configure Launcher resources = { "nodes": provider.nodes_per_block, "cores": provider.cores_per_node, "memory": provider.mem_per_node or float("inf"), - "gpus": provider.gpus_per_node, + "gpus": provider.gpus_per_node or 0, "lifetime": str_to_timedelta(provider.walltime).seconds, } return provider, resources @@ -224,26 +237,23 @@ def __init__( self.container = container if self.use_gpu: - msg = "" - if resources["gpus"] == 0: - msg = "GPU usage requested but no GPUs available" - elif container is not None and container.gpu_flavour is None: - msg = "Provide container 'gpu_flavour' to choose between CUDA and ROCM" - if msg: - raise ConfigurationError(msg) + ensure( + resources["gpus"] > 0, msg="GPU usage requested but no GPUs 
available" + ) + ensure( + container is None or container.gpu_flavour is not None, + msg="Provide container 'gpu_flavour' to choose between CUDA and ROCM", + ) if self.executor_type == "workqueue": # WQ-specific checks - msg = "" - if self.kwargs["gpus_per_task"] > resources["gpus"]: - msg = "GPUs" - if self.kwargs["cores_per_task"] > resources["cores"]: - msg = "cores" - if self.kwargs["mem_per_task"] > resources["memory"]: - msg = "memory" - if msg: - msg = f"Apps will request more {msg} than available per Parsl block" - raise ConfigurationError(msg) + ensure( + self.kwargs["gpus_per_task"] <= resources["gpus"], + self.kwargs["cores_per_task"] <= resources["cores"], + self.kwargs["mem_per_task"] <= resources["memory"], + msgs=["GPUs", "cores", "memory"], + template="Apps will request more {} than available per Parsl block", + ) # how long can individual tasks run (in seconds) if max_runtime is None: @@ -313,7 +323,8 @@ def cores_per_task(self) -> int: if self.executor_type == "workqueue": return self.kwargs["cores_per_task"] # assumes all threads are working - return int(self.resources["cores"] / self.kwargs["max_threads"]) + cores_per_thread = self.resources["cores"] / self.kwargs["max_threads"] + return max(int(cores_per_thread), 1) @property def task_slots(self) -> int: @@ -335,12 +346,11 @@ def wrap_in_timeout(self, command: str) -> str: # send SIGTERM after max_runtime, follow with SIGKILL 30s later return f"timeout -k 30s {self.max_runtime}s {command}" - # def wrap_in_srun(self, command: str) -> str: - # # TODO: stub -- this does not work - # if self.provider is None: - # return command # noop - - return f"srun -t 1 -c $CORES {command}" + # def wrap_in_srun(self, command: str) -> str: + # # TODO: stub -- this does not work + # if self.provider is None: + # return command # noop + # return f"srun -t 1 -c $CORES {command}" def _create_threadpool(self, path: Path) -> ThreadPoolExecutor: max_threads = self.kwargs["max_threads"] @@ -399,11 +409,12 @@ def 
from_config( **kwargs, ): if executor == "threadpool": - assert container is None, "Threadpool not compatible with containers" - assert ( - "slurm" not in kwargs - ), "Threadpool not compatible with remote execution" - assert "max_threads" in kwargs, "Specify 'max_threads' for parallelism" + ensure(container is None, msg="Threadpool not compatible with containers") + ensure("max_threads" in kwargs, msg="Specify 'max_threads' for parallelism") + ensure( + "slurm" not in kwargs, + msg="Threadpool not compatible with remote execution", + ) executor_kwargs = { "max_threads": kwargs["max_threads"], "use_gpu": kwargs.get("use_gpu", False), @@ -414,15 +425,12 @@ def from_config( "gpus_per_task": kwargs.get("gpus_per_task", 0), "mem_per_task": kwargs.get("mem_per_task", 0), } - assert ( - executor_kwargs["cores_per_task"] > 0 - ), "WQ needs at least one core to launch tasks" + if executor_kwargs["cores_per_task"] == 0: + raise ConfigurationError("WQ needs at least one core to launch tasks") min_runtime = kwargs.get("min_runtime", "00:00:00") executor_kwargs["min_runtime"] = str_to_timedelta(min_runtime).seconds else: - raise ConfigurationError( - "Key 'executor' must be 'threadpool' or 'workqueue'" - ) + raise ConfigurationError("Invalid executor key") # search for Parsl ExecutionProvider block, defaulting to "local" if "slurm" in kwargs: @@ -448,7 +456,7 @@ def from_config( class ModelEvaluation(ExecutionDefinition): def __init__( self, - timeout: float = 5.0, + timeout: float = 10.0, max_resource_multiplier: int | None = None, allow_oversubscription: bool = True, **kwargs, @@ -625,7 +633,7 @@ def __init__( self.path.mkdir(parents=True, exist_ok=True) self.definitions = {d.name: d for d in definitions} - assert len(self.definitions) == len(definitions) + ensure(len(self.definitions) == len(definitions)) # make sure task tmpdirs can be made Path(tmpdir_root).mkdir(parents=True, exist_ok=True) diff --git a/psiflow/utils/logging.py b/psiflow/utils/logging.py index 
0976590..373a613 100644 --- a/psiflow/utils/logging.py +++ b/psiflow/utils/logging.py @@ -12,7 +12,7 @@ def setup_logging(file: Path, level=logging.INFO) -> None: fh = logging.FileHandler(file) formatter = logging.Formatter( - fmt='%(asctime)s %(name)s [%(levelname)s] %(message)s', + fmt='%(asctime)s [%(levelname)s] %(name)s \t %(message)s', datefmt='%Y-%m-%d %H:%M' ) fh.setFormatter(formatter) diff --git a/pyproject.toml b/pyproject.toml index 1f20c23..898ab5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "psiflow" -version = "4.0.1" +version = "4.0.2" description = "Library for developing interatomic potentials" readme = "README.md" requires-python = ">=3.10" @@ -13,12 +13,11 @@ dependencies = [ "ase>=3.23.0", "pyyaml>=6.0", "numpy>=1.22.3, <2", - "parsl==2024.12.16", + "parsl==2026.02.16", "prettytable", "psutil", "cp2k-input-tools @ git+https://github.com/cp2k/cp2k-input-tools.git@3b9929735dcb3c8c0620a548b1fe20efecbad077", # need 2024.1 "ipi @ git+https://github.com/i-pi/i-pi.git@v3.1.10", - "pytimeparse", ] diff --git a/tests/conftest.py b/tests/conftest.py index e09e86e..2052219 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,3 @@ -import xml.etree.ElementTree as ET from dataclasses import asdict from pathlib import Path @@ -44,7 +43,6 @@ def context(request, tmp_path_factory): path_config = Path(request.config.getoption("--psiflow-config")) with open(path_config, "r") as f: psiflow_config = yaml.safe_load(f) - psiflow_config["path"] = tmp_path_factory.mktemp("psiflow_internal") psiflow.load(psiflow_config) context = psiflow.context() # noqa: F841 yield diff --git a/tests/test_execution.py b/tests/test_execution.py new file mode 100644 index 0000000..6a456d9 --- /dev/null +++ b/tests/test_execution.py @@ -0,0 +1,59 @@ +import yaml + +from psiflow.execution import ExecutionDefinition + + +def test_execution(): + """Check a few execution parameters. 
This could definitely be expanded upon.""" + data_yaml = """ + executor: threadpool + max_threads: 42 + use_gpu: false + max_runtime: 00:59:00 + local: + cores: 4 + memory: 8 + """ + definition = ExecutionDefinition.from_config(**yaml.safe_load(data_yaml)) + assert definition.executor_type == "threadpool" + assert definition.provider is None + assert definition.container is None + assert definition.lifetime == float("inf") + assert definition.max_runtime == 3540 + assert definition.task_slots == 42 + assert definition.cores_per_task == 1 + assert definition.use_gpu == False + assert definition.spec is None + + data_yaml = """ + cores_per_task: 3 + gpus_per_task: 0 + mem_per_task: 6 + min_runtime: 00:15:00 + env_vars: + CUSTOM_KEY: custom_var + slurm: + cores_per_node: 8 + mem_per_node: 16 + walltime: "02:00:00" + """ + spec = { + "cores": 3, + "disk": 0, + "gpus": 0, + "memory": 6000, + "priority": 0, + "running_time_min": 900, + } + data = yaml.safe_load(data_yaml) + definition = ExecutionDefinition.from_config(**data) + assert definition.executor_type == "workqueue" + assert definition.provider is not None + assert definition.lifetime == 7200 + assert definition.max_runtime == 7140 + assert definition.task_slots == 2 + assert definition.cores_per_task == 3 + assert definition.use_gpu == False + assert definition.spec == spec + assert definition.env_vars["CUSTOM_KEY"] == "CUSTOM_VAR" + From 9259f8014729fd52ba217f4ffc4bb7ff4d30bf3e Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Mon, 30 Mar 2026 13:19:27 +0200 Subject: [PATCH 14/15] update example configs --- configs/example_local_debug.yaml | 15 ++++++++ configs/example_tier0_lumi.yaml | 21 ++++++++++ configs/example_tier1_hortense.yaml | 19 ++++++++++ configs/example_tier2_ugent.yaml | 59 +++++++++++++++++++++++++++++ configs/local_test.yaml | 5 +-- configs/old_hortense.yaml | 42 -------------------- configs/old_lumi.yaml | 39 ------------------- configs/old_threadpool.yaml | 29 -------------- 
configs/old_wq.yaml | 17 --------- psiflow/utils/config.py | 2 +- 10 files changed, 116 insertions(+), 132 deletions(-) create mode 100644 configs/example_local_debug.yaml create mode 100644 configs/example_tier0_lumi.yaml create mode 100644 configs/example_tier1_hortense.yaml create mode 100644 configs/example_tier2_ugent.yaml delete mode 100644 configs/old_hortense.yaml delete mode 100644 configs/old_lumi.yaml delete mode 100644 configs/old_threadpool.yaml delete mode 100644 configs/old_wq.yaml diff --git a/configs/example_local_debug.yaml b/configs/example_local_debug.yaml new file mode 100644 index 0000000..385e0e4 --- /dev/null +++ b/configs/example_local_debug.yaml @@ -0,0 +1,15 @@ +# log all of the messages +parsl_log_level: DEBUG +psiflow_log_level: DEBUG + +# tell Parsl to not clean up completed tasks from the DFK +# you can print out the full list of tasks with 'log_dfk_tasks' from psiflow.utils.logging +garbage_collect: false + +# tell psiflow to execute every bash app in a specified directory - and not clean up afterwards +# allows you to track which files are created in each task +tmpdir_root: ~/psiflow_tasks +keep_tmpdirs: true + +# not specifying ModelEvaluation or ModelTraining falls back to the default in 'psiflow.utils.config' +# this does not include any ReferenceEvaluation blocks, so Reference apps will not run with this configuration diff --git a/configs/example_tier0_lumi.yaml b/configs/example_tier0_lumi.yaml new file mode 100644 index 0000000..13a6328 --- /dev/null +++ b/configs/example_tier0_lumi.yaml @@ -0,0 +1,21 @@ +# Tier-0 LUMI requires an active research project to submit calculations + +# run ModelEvaluation and ModelTraining in a psiflow container +container: + uri: oras://ghcr.io/molmod/psiflow:4.0.0_rocm6.2 # outdated uri + engine: singularity # LUMI uses singularity instead of apptainer + gpu_flavour: rocm # GPU nodes are of the AMD kind + +# not specifying max_runtime defaults to (just shy of) job walltime +ModelTraining: + 
cores_per_task: 56 + gpus_per_task: 8 + slurm: + partition: standard-g + account: project_465001125 + nodes_per_block: 1 + cores_per_node: 56 + gpus_per_node: 8 + max_blocks: 1 + walltime: "12:00:00" + scheduler_options: "#SBATCH --clusters=dodrio\n#SBATCH --gpus=1\n" diff --git a/configs/example_tier1_hortense.yaml b/configs/example_tier1_hortense.yaml new file mode 100644 index 0000000..8bfe95c --- /dev/null +++ b/configs/example_tier1_hortense.yaml @@ -0,0 +1,19 @@ +# Tier-1 Hortense requires an active research project to submit calculations +# always make sure to 'unset SBATCH_PARTITION' before submitting anything + +# when not using a psiflow container, you need to make sure all necessary software is available on the worker node +# this can be through installed modules, custom environments, ... +ModelEvaluation: + cores_per_task: 8 + gpus_per_task: 1 + max_runtime: 06:00:00 + slurm: + partition: gpu_rome_a100 # specify node partition where jobs should run + account: 2026_042 # specify your active computational grant + nodes_per_block: 1 + cores_per_node: 32 + gpus_per_block: 4 + max_blocks: 10 + walltime: 06:30:00 + worker_init: micromamba activate my-mace-env # load an environment with the appropriate software + diff --git a/configs/example_tier2_ugent.yaml b/configs/example_tier2_ugent.yaml new file mode 100644 index 0000000..bc87a77 --- /dev/null +++ b/configs/example_tier2_ugent.yaml @@ -0,0 +1,59 @@ +# Tier-2 UGent divides its compute nodes over 'clusters' rather than partitions +# always make sure to 'unset SLURM_CLUSTERS' before submitting anything + +# retry failed apps once +retries: 1 + +# psiflow always launches one HighThroughputExecutor and one ThreadPoolExecutor to handle internal apps (IO, parsing) +# these tasks run locally and should never do any real computational work +# default_threads needs to mainly be high enough to avoid concurrency bottlenecks +default_threads: 8 + +# run ModelEvaluation and ModelTraining in a psiflow container 
+container: + uri: oras://ghcr.io/molmod/psiflow:4.0.0_cu118 # outdated uri + engine: apptainer + gpu_flavour: cuda # required for GPU usage + +# tasks will use (at least) two cores (no GPUs) and can run for six hours at most +# parsl asks SLURM for 16 cores on either doduo or shinx, so 8 tasks can run per SLURM job +# only 10 SLURM jobs will run at once +ModelEvaluation: + cores_per_task: 2 + max_runtime: 06:00:00 + slurm: + nodes_per_block: 1 + cores_per_node: 16 + max_blocks: 10 + walltime: 08:00:00 + clusters: doduo,shinx + +# tasks will use 12 cores and one GPU, running for four hours at most +# parsl asks SLURM for 12 cores + 1 GPU on accelgor, so one task can run per SLURM job +ModelTraining: + cores_per_task: 12 + gpus_per_task: 1 + max_runtime: 04:00:00 + slurm: + nodes_per_block: 1 + cores_per_node: 12 + gpus_per_node: 1 + walltime: 04:00:00 + clusters: accelgor + +# tasks will use 32 cores (by default), running for one hour at most +# tasks will be killed when they exceed 64 GB of memory usage +# parsl asks SLURM for 32 cores on doduo or shinx, so one task can run per SLURM job +# tells WQ to load a CP2K module before trying to launch calculations +CP2K: + cores_per_task: 32 + max_runtime: 01:00:00 + memory_limit: 64 + slurm: + nodes_per_block: 1 + cores_per_node: 32 + mem_per_node: 64 + walltime: 04:00:00 + max_blocks: 50 + clusters: doduo,shinx + worker_init: ml CP2K/2023.1-foss-2023a diff --git a/configs/local_test.yaml b/configs/local_test.yaml index 1342c39..9381058 100644 --- a/configs/local_test.yaml +++ b/configs/local_test.yaml @@ -1,4 +1,3 @@ ---- psiflow_log_level: INFO ModelEvaluation: @@ -11,7 +10,6 @@ ModelTraining: max_threads: 4 max_runtime: 00:00:20 - CP2K: executor: workqueue cores_per_task: 2 @@ -23,6 +21,7 @@ CP2K: GPAW: executor: workqueue cores_per_task: 2 + max_runtime: 00:00:20 container: uri: oras://ghcr.io/molmod/gpaw:24.1 @@ -30,5 +29,3 @@ ORCA: executor: workqueue cores_per_task: 2 - -...
diff --git a/configs/old_hortense.yaml b/configs/old_hortense.yaml deleted file mode 100644 index d6ccc68..0000000 --- a/configs/old_hortense.yaml +++ /dev/null @@ -1,42 +0,0 @@ ---- -parsl_log_level: WARNING -container_engine: 'apptainer' -container_uri: 'oras://ghcr.io/molmod/psiflow:4.0.0_cu118' -default_threads: 8 -ModelEvaluation: - cores_per_worker: 12 - gpu: True - max_simulation_time: 20 - slurm: - partition: "gpu_rome_a100" - account: "2023_070" - nodes_per_block: 1 - cores_per_node: 48 - max_blocks: 1 - walltime: "12:00:00" - scheduler_options: "#SBATCH --clusters=dodrio\n#SBATCH --gpus=4\n" -ModelTraining: - cores_per_worker: 12 - gpu: true - max_training_time: 40 - slurm: - partition: "gpu_rome_a100" - account: "2023_070" - nodes_per_block: 1 - cores_per_node: 12 - max_blocks: 1 - walltime: "12:00:00" - scheduler_options: "#SBATCH --clusters=dodrio\n#SBATCH --gpus=1\n" -CP2K: - cores_per_worker: 64 - max_evaluation_time: 30 - launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -np 32 -bind-to core cp2k.psmp' - slurm: - partition: "cpu_rome" - account: "2024_079" - nodes_per_block: 1 - cores_per_node: 64 - max_blocks: 25 - walltime: "06:00:00" - scheduler_options: "#SBATCH --clusters=dodrio\n" -... 
diff --git a/configs/old_lumi.yaml b/configs/old_lumi.yaml deleted file mode 100644 index b5e9a14..0000000 --- a/configs/old_lumi.yaml +++ /dev/null @@ -1,39 +0,0 @@ ---- -parsl_log_level: WARNING -container_engine: 'singularity' -container_uri: 'oras://ghcr.io/molmod/psiflow:4.0.0_rocm6.2' -default_threads: 8 -CP2K: - cores_per_worker: 32 - max_evaluation_time: 20 - launch_command: 'singularity exec -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -np 32 cp2k.psmp' - slurm: - partition: "standard" - account: "project_465001125" - nodes_per_block: 1 - cores_per_node: 128 - max_blocks: 10 - walltime: "01:00:00" -ModelEvaluation: - cores_per_worker: 7 - gpu: True - slurm: - partition: "standard-g" - account: "project_465001125" - nodes_per_block: 1 - cores_per_node: 56 - max_blocks: 10 - walltime: "01:00:00" - scheduler_options: "#SBATCH --gres=gpu:8\n" -ModelTraining: - cores_per_worker: 7 - gpu: true - multigpu: true - slurm: - partition: "standard-g" - account: "project_465001125" - nodes_per_block: 1 - cores_per_node: 56 - walltime: "01:00:00" - scheduler_options: "#SBATCH --gres=gpu:8\n" -... 
diff --git a/configs/old_threadpool.yaml b/configs/old_threadpool.yaml deleted file mode 100644 index 33f6c57..0000000 --- a/configs/old_threadpool.yaml +++ /dev/null @@ -1,29 +0,0 @@ ---- -parsl_log_level: WARNING -retries: 0 -ModelEvaluation: - gpu: false - use_threadpool: true - max_simulation_time: 0.4 -ModelTraining: - gpu: true - use_threadpool: true - max_training_time: 1 - max_workers: 1 # suppress assertion for multigpu training -CP2K: - cores_per_worker: 2 - max_evaluation_time: 0.3 - launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -bind-to core -np 2 -env OMP_NUM_THREADS 1 cp2k.psmp' -CP2K_container: - cores_per_worker: 2 - max_evaluation_time: 0.3 - launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -bind-to core -np 2 -env OMP_NUM_THREADS 1 cp2k.psmp' -GPAW: - cores_per_worker: 2 - max_evaluation_time: 0.3 - launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/gpaw:24.1 /opt/entry.sh mpirun -np 2 gpaw python /opt/run_gpaw.py' -GPAW_container: - cores_per_worker: 2 - max_evaluation_time: 0.3 - launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/gpaw:24.1 /opt/entry.sh mpirun -np 2 gpaw python /opt/run_gpaw.py' -... diff --git a/configs/old_wq.yaml b/configs/old_wq.yaml deleted file mode 100644 index 660d784..0000000 --- a/configs/old_wq.yaml +++ /dev/null @@ -1,17 +0,0 @@ ---- -parsl_log_level: WARNING -default_threads: 4 -ModelEvaluation: - cores_per_worker: 4 - gpu: True - max_simulation_time: 0.4 -ModelTraining: - cores_per_worker: 4 - gpu: true - max_training_time: 1 - max_workers: 1 -CP2K: - cores_per_worker: 2 - max_evaluation_time: 0.3 - launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2023.2 /opt/entry.sh mpirun -np 2 -x OMP_NUM_THREADS=1 cp2k.psmp' -... 
diff --git a/psiflow/utils/config.py b/psiflow/utils/config.py index 14be6b1..59e140b 100644 --- a/psiflow/utils/config.py +++ b/psiflow/utils/config.py @@ -8,7 +8,7 @@ parsl_log_level: WARNING psiflow_log_level: WARNING usage_tracking: 3 -default_threads: 4 +default_threads: 8 tmpdir_root: /tmp keep_tmpdirs: false From c131be11fd25027dba8ea647cdece34e26e07f0e Mon Sep 17 00:00:00 2001 From: pdobbelaere Date: Mon, 30 Mar 2026 13:19:41 +0200 Subject: [PATCH 15/15] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 898ab5b..61df7a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ dependencies = [ "ase>=3.23.0", "pyyaml>=6.0", "numpy>=1.22.3, <2", - "parsl==2026.02.16", + "parsl==2026.02.23", "prettytable", "psutil", "cp2k-input-tools @ git+https://github.com/cp2k/cp2k-input-tools.git@3b9929735dcb3c8c0620a548b1fe20efecbad077", # need 2024.1