Coverage for src/metador_core/packer/__init__.py: 63% (94 statements)
coverage.py v7.3.2, created at 2023-11-02 09:33 +0000
1"""Definition of HDF5 packer plugin interface."""
2from __future__ import annotations
4from abc import ABC, abstractmethod
5from io import UnsupportedOperation
6from pathlib import Path
7from typing import Callable, Tuple, Type
9import wrapt
10from overrides import EnforceOverrides, overrides
12from ..container import MetadorContainer
13from ..ih5.manifest import IH5MFRecord
14from ..plugin import interface as pg
15from ..plugins import plugingroups
16from ..schema.core import MetadataSchema
17from ..schema.plugins import PluginPkgMeta
18from ..util.diff import DirDiff
19from ..util.hashsums import DirHashsums, dir_hashsums
20from .types import DirValidationErrors


class Packer(ABC, EnforceOverrides):
    """Interface to be implemented by Metador HDF5 packer plugins.

    These plugins are how support for wildly different domain-specific
    use-cases can be added to Metador in an opt-in and loosely-coupled way.

    Users can install only the packer plugins they need for their use-cases,
    and such plugins can be developed independently of the rest
    of the Metador tooling, as long as this interface is respected.

    Carefully read the documentation for the required attributes and methods
    and implement them for your use-case in a subclass.
    See `metador_core.packer.example.GenericPacker` for an example plugin.

    Requirements for well-behaved packers:

    1. No closing of the container:
    The packer gets a writable record and is only responsible for performing
    the necessary additions, deletions and modifications. It is not allowed
    to `close()` the container.

    2. No access to data in the container:
    Data in the container MUST NOT be read or relied on for doing an update,
    as the nodes could be dummy stubs. One MAY rely on the existence or absence of
    Groups, Datasets, Attributes and Metadata in the container (e.g. `in` or `keys`).

    3. Source directory is read-only:
    Files or directories inside of `data_dir` MUST NOT be created, deleted or
    modified by the packer.

    4. Exceptional termination:
    If packing must be aborted, an exception MUST be raised.
    If the exception happened due to invalid data or metadata, it MUST be
    a `DirValidationErrors` object, as returned by `check_dir`, helping the user
    find and fix the problem. Otherwise, a different appropriate exception may be used.

    5. Semantic correctness:
    Packing a directory into a fresh container and updating an existing container
    MUST lead to the same observable result.

    If you cannot guarantee this in full generality, do not implement `update`.
    In that case, if a container is updated, it will be cleared and then `pack` is
    called on it, as if it were a fresh container. In this case, there is no space
    advantage gained over a fresh container (but it will keep its UUID).

    6. Semantic versioning of packers:
    A packer MUST be able to update records that were created by this packer
    of the same or an earlier MINOR version.

    More formally, the version MAJOR.MINOR.PATCH
    MUST adhere to the following contract:

    1. increasing MAJOR means a break in backward-compatibility
    for older datasets (i.e. the new packer cannot work with old records),

    2. increasing MINOR means a break in forward-compatibility
    for newer datasets (i.e. older packers will not work with newer records),

    3. increasing PATCH does not affect compatibility
    for datasets with the same MAJOR and MINOR version.

    When the packer is updated, the Python package version MUST increase
    in a suitable way. As usual, whenever an earlier number is increased,
    the following numbers are reset to zero.

    This means that the PATCH version should increase e.g. for bugfixes that do
    not change the structure or metadata stored in the dataset;
    MINOR should increase whenever, from this version on, older versions of the packer
    could not produce a valid update for a dataset created with this version,
    while upgrading older existing datasets with this version still works;
    and MAJOR should increase when all compatibility guarantees are off
    and the resulting container cannot be migrated or updated automatically.

    For example, a packer at version 2.1.3 MUST be able to update records created
    by versions 2.0.x and 2.1.x, is not expected to handle records created by
    2.2.0 or later, and cannot be assumed to work with records from any 1.x version.

    You SHOULD provide tooling to migrate datasets between major versions.
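
    Example (illustrative sketch; `MyPacker`, `my.packer` and `table.csv` are
    hypothetical names, and the `DirValidationErrors` constructor is assumed to
    accept the path-to-errors dict described in `check_dir`):

        class MyPacker(Packer):
            class Plugin:
                name = "my.packer"
                version = (0, 1, 0)

            @classmethod
            def check_dir(cls, data_dir: Path) -> DirValidationErrors:
                errs = {}
                if not (data_dir / "table.csv").is_file():
                    errs["table.csv"] = ["Missing required file."]
                return DirValidationErrors(errs)

            @classmethod
            def pack(cls, mc: MetadorContainer, data_dir: Path):
                # store the file as a dataset (h5py-style item assignment);
                # `update` is not overridden, so updates re-pack from scratch
                mc["table"] = (data_dir / "table.csv").read_bytes()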
    """

    Plugin: PackerPlugin

    @classmethod
    @abstractmethod
    def check_dir(cls, data_dir: Path) -> DirValidationErrors:
        """Check whether the given directory is suitable for packing with this plugin.

        This method will be called before `pack` or `update` and MUST detect
        all problems (such as missing or invalid data or metadata) that can be
        expected to be fixed by the user in preparation for the packing.

        More specifically, it MUST cover all metadata that is to be provided directly by
        the user (i.e. is not inferred or extracted from generated data) for the purpose
        of packing and SHOULD try to cover as many problems with data and metadata as
        possible to avoid failure during the actual packing process.

        Files or directories inside of `data_dir` MUST NOT be created,
        deleted or modified by this method.

        Args:
            data_dir: Directory containing all the data to be packed.

        Returns:
            DirValidationErrors initialized with a dict mapping file paths
            (relative to `data_dir`) to lists of detected errors.

            The errors must be either a string (containing a human-readable summary of all
            problems with that file), or another dict with more granular error messages,
            in case the file is e.g. a JSON-compatible file subject to validation
            with JSON Schemas.
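
            Example return value (illustrative, with hypothetical file names):

                DirValidationErrors({
                    "data.bin": ["File too large."],
                    "metadata.json": [{"title": ["field required"]}],
                })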
        """
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def update(cls, mc: MetadorContainer, data_dir: Path, diff: DirDiff):
        """Update a MetadorContainer with changes done to the data source directory.

        The container `mc` is assumed to be writable, and is either empty
        or was previously packed by a compatible version of the packer.

        The `data_dir` is assumed to be suitable (according to `check_dir`).

        The `diff` structure contains information about changed paths.

        If not implemented, updates will be created by clearing the provided
        container and using `pack` on it.

        Args:
            mc: Metador container to pack the data into or update
            data_dir: Directory containing all the data to be packed
            diff: Diff tree of dirs and files in data_dir compared to a previous state
        """
        # default fallback implementation using pack
        for obj in [mc, mc.attrs, mc.meta]:
            # materialize the keys first, as we delete entries while iterating
            for key in list(obj.keys()):
                del obj[key]
        return cls.pack(mc, data_dir)

    @classmethod
    @abstractmethod
    def pack(cls, mc: MetadorContainer, data_dir: Path):
        """Pack a directory into a MetadorContainer.

        The container `mc` is assumed to be writable and empty.

        The `data_dir` is assumed to be suitable (according to `check_dir`).

        If not implemented, initial packing is done using `update`
        with an empty container and a diff containing all the files.

        Args:
            mc: Metador container to pack the data into
            data_dir: Directory containing all the data to be packed
        """
        # default fallback implementation using update
        return cls.update(mc, data_dir, DirDiff.compare({}, dir_hashsums(data_dir)))


class PackerInfo(MetadataSchema):
    """Schema for info about the packer that was used to create a container."""

    class Plugin:
        name = "core.packerinfo"
        version = (0, 1, 0)

    packer: PGPacker.PluginRef
    """Packer plugin used to pack the container."""

    pkg: PluginPkgMeta
    """Python package that provides the packer plugin."""

    source_dir: DirHashsums = {}
    """Directory skeleton with hashsums of files at the time of packing."""

    @classmethod
    def for_packer(cls, packer_name: str, packer_version=None) -> PackerInfo:
        """Return a PackerInfo for the given (installed) packer plugin."""
        from ..plugins import packers

        p_ref = packers.resolve(packer_name, packer_version)
        return PackerInfo(
            packer=p_ref,
            pkg=packers.provider(p_ref),
        )
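
# Illustrative usage sketch (assuming an installed packer plugin with the
# hypothetical name "my.packer"):
#
#     pinfo = PackerInfo.for_packer("my.packer")
#     pinfo.packer  # resolved PluginRef of the packer plugin
#     pinfo.pkg     # PluginPkgMeta of the Python package providing it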


class Unclosable(wrapt.ObjectProxy):
    """Wrapper to prevent packers from closing/completing a container file."""

    _self_MSG = "Packers must not finalize the container!"

    def close(self):
        raise UnsupportedOperation(self._self_MSG)

    # specific for IH5Record subtypes:

    def discard_patch(self):
        raise UnsupportedOperation(self._self_MSG)

    def commit_patch(self):
        raise UnsupportedOperation(self._self_MSG)
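
# Illustrative sketch of the proxy behavior (assuming `container` is an open,
# writable MetadorContainer):
#
#     safe = Unclosable(container)
#     safe["group/data"] = b"..."  # normal access passes through (wrapt.ObjectProxy)
#     safe.close()                 # raises UnsupportedOperation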


PACKER_GROUP_NAME = "packer"


class PackerPlugin(pg.PluginBase):
    ...


class PGPacker(pg.PluginGroup[Packer]):
    """Packer plugin group interface."""

    class Plugin:
        name = PACKER_GROUP_NAME
        version = (0, 1, 0)
        plugin_class = Packer
        plugin_info_class = PackerPlugin
        requires = [
            plugingroups.PluginRef(name="schema", version=(0, 1, 0)),
            plugingroups.PluginRef(name="harvester", version=(0, 1, 0)),
        ]

    _PACKER_INFO_NAME = PackerInfo.Plugin.name

    @overrides
    def check_plugin(self, ep_name: str, plugin: Type[Packer]):
        pg.util.check_implements_method(ep_name, plugin, Packer.check_dir)
        missing_pack = not pg.util.implements_method(plugin, Packer.pack)
        missing_update = not pg.util.implements_method(plugin, Packer.update)
        if missing_pack and missing_update:
            raise TypeError(f"{ep_name}: Neither pack nor update are implemented!")

    # ----

    def _prepare(self, pname: str, srcdir: Path) -> Tuple[Type[Packer], DirHashsums]:
        """Return packer class and hashsums of the given directory.

        Raises an exception if the packer is not found or `packer.check_dir` fails.
        """
        packer = self[pname]
        if errs := packer.check_dir(srcdir):
            raise errs
        return (packer, dir_hashsums(srcdir))

    def pack(
        self, packer_name: str, data_dir: Path, target: Path, h5like_cls: Callable
    ):
        """Pack a directory into a container using an installed packer.

        `packer_name` must be the name of an installed packer plugin.

        `data_dir` must be an existing directory suitable for the packer.

        `target` must be a non-existing path and will be passed into `h5like_cls` as-is.

        `h5like_cls` must be a class compatible with MetadorContainer.

        Note that in case an exception happens during packing, no cleanup is done;
        the user is responsible for removing inconsistent files that were created.

        Args:
            packer_name: installed packer plugin name
            data_dir: data source directory
            target: target path for resulting container
            h5like_cls: class to use for creating the container
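
        Example (illustrative; assumes an installed packer plugin named
        "my.packer" and plain HDF5 files, i.e. `h5py.File` as the h5like class):

            from metador_core.plugins import packers
            packers.pack("my.packer", Path("./data"), Path("./out.h5"), h5py.File)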
        """
        packer, hashsums = self._prepare(packer_name, data_dir)
        # use skel_only to enforce stub-compatibility of packer
        container = MetadorContainer(h5like_cls(target, "x")).restrict(skel_only=True)
        packer.pack(Unclosable(container), data_dir)
        self._finalize(packer_name, hashsums, container)

    def update(self, packer_name: str, data_dir: Path, target: Path, h5like_cls):
        """Update a container from its source directory using an installed packer.

        Like `pack`, but the `target` must be an existing container which can be
        opened with the `h5like_cls` and was packed by a compatible packer.

        Note that in case an exception happens during packing, no cleanup is done,
        and if the container has been written to, the changes persist.

        The user is responsible for removing inconsistent files that were created
        and ensuring that the previous state can be restored, e.g. from a backup.
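
        Example (illustrative; continues the `pack` example above, after the
        source directory `./data` has changed):

            packers.update("my.packer", Path("./data"), Path("./out.h5"), h5py.File)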
        """
        packer, hashsums = self._prepare(packer_name, data_dir)
        # use skel_only to enforce stub-compatibility of packer
        container = MetadorContainer(h5like_cls(target, "r+")).restrict(skel_only=True)

        # check compatibility
        pinfo = container.meta.get(self._PACKER_INFO_NAME)
        if not pinfo:
            msg = f"Container does not have {self._PACKER_INFO_NAME} metadata!"
            raise ValueError(msg)

        curr_ref = self.resolve(packer_name)
        if not curr_ref.supports(pinfo.packer):
            msg = f"{curr_ref} (installed) does not support {pinfo.packer} (container)!"
            raise ValueError(msg)

        diff = DirDiff.compare(pinfo.source_dir, hashsums)
        packer.update(Unclosable(container), data_dir, diff)
        self._finalize(packer_name, hashsums, container)

    def _finalize(self, pname: str, hsums: DirHashsums, cont: MetadorContainer):
        """Set or update packer info in container and close it."""
        if self._PACKER_INFO_NAME in cont.meta:
            del cont.meta[self._PACKER_INFO_NAME]

        pinfo = PackerInfo.for_packer(pname)
        pinfo.source_dir = hsums
        cont.meta[self._PACKER_INFO_NAME] = pinfo

        if isinstance(cont, IH5MFRecord):
            # when using IH5MFRecord, we want the packer info in the manifest,
            # so tooling can decide whether a container can be updated
            # without having the container itself available
            # (the default manifest already has enough info for creating stubs)
            cont.manifest.manifest_exts[self.name] = pinfo.dict()

        cont.close()