From ec5ebf0dc98e9d4f48bcc8732dddab3355e78ca8 Mon Sep 17 00:00:00 2001 From: Martin Lehmann Date: Fri, 16 Dec 2022 15:31:19 +0100 Subject: [PATCH] feat(loader): Selectively parse AIRD fragments This commit changes the MelodyLoader to selectively parse only the metadata section of AIRD files, and not keep the representation data in memory all the time. This dramatically reduces the runtime memory footprint and loading times for Capella models, especially when there are lots of representations in a given model. --- src/capellambse/loader/core.py | 121 +++++++++++++++++++++++++++++--- src/capellambse/model/_model.py | 3 +- src/capellambse/model/_obj.py | 3 +- 3 files changed, 117 insertions(+), 10 deletions(-) diff --git a/src/capellambse/loader/core.py b/src/capellambse/loader/core.py index 4c7da5309..cf9d052f5 100644 --- a/src/capellambse/loader/core.py +++ b/src/capellambse/loader/core.py @@ -187,7 +187,12 @@ def __missing__(self, key: str) -> t.NoReturn: class ModelFile: - """Represents a single file in the model (i.e. a fragment).""" + """Represents a single file in the model (i.e. a fragment). + + This class loads the entire XML tree into memory. This makes it + unsuitable for large trees with only small interesting segments, + like ``.aird`` files. See :class:`VisualFile` for an alternative. + """ __qtypecache: dict[etree.QName, dict[int, etree._Element]] __xtypecache: dict[str, dict[int, etree._Element]] @@ -474,6 +479,99 @@ def unfollow_href(self, element_id: str) -> etree._Element: return self.__hrefsources[element_id] +class VisualFile: + """Represents a visual (AIRD) fragment. + + Visual fragments can rapidly grow very large, which makes it + impractical to hold them entirely in memory all the time. This + specialized class works similarly to :class:`ModelFile`. However, it + only keeps the central index in memory, and only loads and parses + other data on request. 
+ """ + + fragment_type: t.Final = FragmentType.VISUAL + + def __init__( + self, + filename: pathlib.PurePosixPath, + handler: filehandler.FileHandler, + ) -> None: + self.filename = filename + self.filehandler = handler + if filename.suffix not in VISUAL_EXTS: + raise ValueError(f"Bad filename for visual fragment: {filename}") + + with handler.open(filename) as f: + parser = etree.iterparse(f) + for _, element in parser: + parent = element.getparent() + if parent is None or parent.getparent() is not None: + continue + + if element.tag == f"{{{_n.NAMESPACES['viewpoint']}}}DAnalysis": + self.__analysis = element + break + parent.remove(element) + else: + raise RuntimeError( + "Broken XML: No 'viewpoint:DAnalysis' element found" + ) + parent = self.__analysis.getparent() + assert parent is not None + parent.remove(self.__analysis) + + def __getitem__(self, key: str) -> etree._Element: + # TODO Return a diagram root element if it's found in this fragment + raise KeyError(key) + + def referenced_files(self) -> cabc.Iterator[str]: + for i in self.__analysis: + if i.tag == "semanticResources" and i.text: + yield i.text + elif i.tag == "referencedAnalysis" and (href := i.get("href")): + yield href.split("#", maxsplit=1)[0] + + def enumerate_uuids(self) -> set[str]: + """Enumerate all UUIDs used in this fragment.""" + return set() + + def idcache_index(self, subtree: etree._Element) -> None: + """Index the IDs of ``subtree``.""" + raise NotImplementedError("Cannot modify visual fragments") + + def idcache_remove(self, source: str | etree._Element) -> None: + """Remove the ID or all IDs below the source from the ID cache.""" + raise NotImplementedError("Cannot modify visual fragments") + + def idcache_rebuild(self) -> None: + """Invalidate and rebuild this file's ID cache.""" + # Nothing to do + + def idcache_reserve(self, new_id: str) -> None: + """Reserve the given ID for an element to be inserted later.""" + raise NotImplementedError("Cannot modify visual fragments") + + 
def iterall_xt( + self, xtypes: cabc.Container[str] + ) -> cabc.Iterator[etree._Element]: + """Iterate over all elements in this tree by ``xsi:type``.""" + del xtypes + yield from () + + def write_xml( + self, + filename: pathlib.PurePosixPath, + encoding: str = "utf-8", + ) -> None: + """Do nothing.""" + del filename, encoding + + # pylint: disable-next=useless-return + def unfollow_href(self, element_id: str) -> etree._Element | None: + del element_id + return None + + class MelodyLoader: """Facilitates extensive access to Polarsys / Capella projects.""" @@ -557,7 +655,7 @@ def __init__( else: self.resources[resname] = reshdl - self.trees: dict[pathlib.PurePosixPath, ModelFile] = {} + self.trees: dict[pathlib.PurePosixPath, ModelFile | VisualFile] = {} self.__load_referenced_files( pathlib.PurePosixPath("\0", self.entrypoint) ) @@ -610,11 +708,17 @@ def __load_referenced_files( handler = self.resources[resource_path.parts[0]] filename = pathlib.PurePosixPath(*resource_path.parts[1:]) - frag = ModelFile( - filename, handler, ignore_uuid_dups=self.__ignore_uuid_dups - ) + frag: VisualFile | ModelFile + if filename.suffix in VISUAL_EXTS: + frag = VisualFile(filename, handler) + refs = list(frag.referenced_files()) + else: + frag = ModelFile( + filename, handler, ignore_uuid_dups=self.__ignore_uuid_dups + ) + refs = [] self.trees[resource_path] = frag - for ref in _find_refs(frag.root): + for ref in refs: ref_name = helpers.normalize_pure_path( _unquote_ref(ref), base=resource_path.parent ) @@ -685,6 +789,7 @@ def update_namespaces(self) -> None: if fragment.fragment_type != FragmentType.SEMANTIC: continue + assert isinstance(fragment, ModelFile) LOGGER.debug("Updating namespaces on fragment %s", fname) fragment.update_namespaces(vp) @@ -961,7 +1066,7 @@ def iterall_xt( """ xtset = self._nonempty_hashset(xtypes) if trees is None: - files: cabc.Iterable[ModelFile] = self.trees.values() + files: cabc.Iterable[ModelFile | VisualFile] = self.trees.values() else: files = 
(v for k, v in self.trees.items() if k in trees) return itertools.chain.from_iterable( @@ -1296,7 +1401,7 @@ def follow_links( def _find_fragment( self, element: etree._Element - ) -> tuple[pathlib.PurePosixPath, ModelFile]: + ) -> tuple[pathlib.PurePosixPath, ModelFile | VisualFile]: root = collections.deque( itertools.chain([element], element.iterancestors()), 1 )[0] diff --git a/src/capellambse/model/_model.py b/src/capellambse/model/_model.py index 0ba7866d8..7fda5da9e 100644 --- a/src/capellambse/model/_model.py +++ b/src/capellambse/model/_model.py @@ -420,7 +420,8 @@ class as the superclass of every concrete model element trees = [ t for t in self._loader.trees.values() - if t.fragment_type is loader.FragmentType.SEMANTIC + if isinstance(t, loader.ModelFile) + and t.fragment_type is loader.FragmentType.SEMANTIC ] matches: cabc.Iterable[etree._Element] if not classes: diff --git a/src/capellambse/model/_obj.py b/src/capellambse/model/_obj.py index a0080894a..bfd87acd1 100644 --- a/src/capellambse/model/_obj.py +++ b/src/capellambse/model/_obj.py @@ -48,7 +48,7 @@ from lxml import etree import capellambse -from capellambse import helpers +from capellambse import helpers, loader from . import VIRTUAL_NAMESPACE_PREFIX, T, U, _descriptors, _pods, _styleclass @@ -725,6 +725,7 @@ def __init__( ns = self.__capella_namespace__ qtype = model.qualify_classname((ns, type(self).__name__)) assert qtype.namespace is not None + assert isinstance(fragment, loader.ModelFile) fragment.add_namespace(qtype.namespace, ns.alias) self._element.set(helpers.ATT_XT, qtype) for key, val in kw.items():