Coverage for src/metador_core/ih5/manifest.py: 100%
125 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-02 09:33 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-02 09:33 +0000
1"""Sidecar JSON file storing a skeleton to create stubs and patch containers."""
2from __future__ import annotations
4from pathlib import Path
5from typing import Any, Dict, List, Optional, Union
6from uuid import UUID, uuid1
8from pydantic import BaseModel
10from ..schema.types import QualHashsumStr
11from ..util.hashsums import qualified_hashsum
12from .record import IH5Record, IH5UserBlock, hashsum_file
13from .skeleton import IH5Skeleton, init_stub_base
class IH5Manifest(BaseModel):
    """Sidecar metadata describing a collection of IH5 record files.

    The manifest carries the skeleton of the container, so that a patch for an
    IH5Record can be created even if its data is not locally available
    (as long as the manifest is).
    """

    # identifies the manifest itself, so the name of the file does not matter
    manifest_uuid: UUID

    # copy of the user block (without the manifest extension part)
    user_block: IH5UserBlock

    # computed with IH5Skeleton, used to create a stub
    skeleton: IH5Skeleton

    # arbitrary extensions, similar to IH5UserBlock
    manifest_exts: Dict[str, Any]

    @classmethod
    def from_userblock(cls, ub: IH5UserBlock, skeleton=None, exts=None) -> IH5Manifest:
        """Build a new manifest (with a fresh UUID) from the given user block.

        A falsy `skeleton` or `exts` argument is replaced by an empty dict.
        The passed user block is not modified.
        """
        if not skeleton:
            skeleton = {}
        if not exts:
            exts = {}

        own_section = IH5UBExtManifest.ext_name()
        stripped_ub = ub.copy()
        # The copy gets its own dict holding all extensions except the manifest
        # one - keeping that one would make manifest and user block circular.
        stripped_ub.ub_exts = {
            name: value for name, value in ub.ub_exts.items() if name != own_section
        }

        return cls(
            manifest_uuid=uuid1(),
            user_block=stripped_ub,
            skeleton=skeleton,
            manifest_exts=exts,
        )

    def __bytes__(self) -> bytes:
        """Return the manifest as UTF-8 encoded JSON, ready to be written to a file."""
        # The trailing newline avoids confusing behaviour with text editors
        # (e.g. vim automatically adds a trailing newline that it hides), see
        # https://stackoverflow.com/questions/729692/why-should-text-files-end-with-a-newline
        as_json = self.json(indent=2)
        return f"{as_json}\n".encode("utf-8")

    def save(self, path: Path):
        """Write the manifest (as returned by bytes()) into the file at `path`."""
        with open(path, "wb") as outfile:
            outfile.write(bytes(self))
            outfile.flush()
class IH5UBExtManifest(BaseModel):
    """User block extension adding stub and manifest support to IH5 files."""

    is_stub_container: bool
    """True if file has the structure of another container, without actual data inside."""

    manifest_uuid: UUID
    """UUID of the manifest file that belongs to this IH5 file."""

    manifest_hashsum: QualHashsumStr
    """Hashsum of the manifest file that belongs to this IH5 file."""

    @classmethod
    def ext_name(cls) -> str:
        """Return the key of the user block extension section holding this info."""
        return "ih5mf_v01"

    @classmethod
    def get(cls, ub: IH5UserBlock) -> Optional[IH5UBExtManifest]:
        """Return the extension parsed from the user block, or None if it has none."""
        section = cls.ext_name()
        if section in ub.ub_exts:
            return cls.parse_obj(ub.ub_exts[section])
        return None

    def update(self, ub: IH5UserBlock):
        """Store this extension in the given user block, replacing an existing one."""
        payload = self.dict()
        ub.ub_exts[self.ext_name()] = payload
class IH5MFRecord(IH5Record):
    """IH5Record extended by a manifest file.

    The manifest file is a sidecar JSON file that contains enough information to support
    the creation of a stub container and patching a dataset without having the actual
    container locally available.

    In a chain of container files, only the base container may be a stub.
    All files without the manifest extension in the userblock are considered not stubs.

    An IH5MFRecord is a valid IH5Record (the manifest file then is simply ignored).
    Also, it is possible to open an IH5Record as IH5MFRecord and turn it into a
    valid IH5MFRecord by committing a patch (this will create the missing manifest).

    In addition to the ability to create stubs, the manifest file can be used to carry
    information that should be attached to a container, but is too large or inappropriate
    for storage in the userblock (e.g. should be available separately).

    The manifest should store information that applies semantically to the whole fileset
    at the current patch level, it MUST NOT be required to have manifest files for each
    ih5 patch. Additional information stored in the manifest is inherited to the
    manifest of successive patches until overridden.

    The main use case of this extension is to be used by automated packers
    from suitably prepared source directories, to be uploaded to remote locations,
    and for these packers being able to create patches for the record without
    access to all the data containers (based only on the most recent manifest file).
    """

    MANIFEST_EXT: str = "mf.json"

    _manifest: Optional[IH5Manifest] = None
    """Manifest of newest loaded container file (only None for new uncommitted records)."""

    @property
    def manifest(self) -> IH5Manifest:
        """Return loaded manifest object of latest committed record patch.

        Raises a ValueError if no manifest has been loaded or created yet.
        """
        if self._manifest is None:  # should only happen with fresh create()d records
            raise ValueError("No manifest exists yet! Did you forget to commit?")
        return self._manifest

    def _fresh_manifest(self) -> IH5Manifest:
        """Return new manifest based on current state of the record."""
        ub = self._ublock(-1)
        skel = IH5Skeleton.for_record(self)
        return IH5Manifest.from_userblock(ub, skeleton=skel, exts={})

    @classmethod
    def _manifest_filepath(cls, record: Union[str, Path]) -> Path:
        """Return canonical filename of manifest based on path of a container file.

        The result is the container path with MANIFEST_EXT appended verbatim.
        """
        return Path(f"{str(record)}{cls.MANIFEST_EXT}")

    # Override to also load and check latest manifest
    @classmethod
    def _open(cls, paths: List[Path], **kwargs):
        """Open the record and load the manifest linked by its latest container.

        The optional keyword argument `manifest_file` names the manifest to use;
        without it, the path is derived from the filename of the latest container.
        All other arguments are passed on to the parent implementation.

        If the user block of the latest container has the manifest extension,
        a ValueError is raised when the manifest file does not exist or its
        hashsum differs from the one stored in the user block. Without the
        extension, no manifest is loaded.
        """
        manifest_file: Optional[Path] = kwargs.pop("manifest_file", None)
        ret: IH5MFRecord = super()._open(paths, **kwargs)

        # if not given explicitly, infer correct manifest filename
        # based on logically latest container (they are sorted after parent init)
        if manifest_file is None:
            manifest_file = cls._manifest_filepath(ret._files[-1].filename)

        # for latest container, check linked manifest (if any) against given/inferred one
        ub = ret._ublock(-1)
        ubext = IH5UBExtManifest.get(ub)
        if ubext is not None:
            if not manifest_file.is_file():
                msg = f"Manifest file {manifest_file} does not exist, cannot open!"
                raise ValueError(f"{ret._files[-1].filename}: {msg}")

            chksum = hashsum_file(manifest_file)
            if ubext.manifest_hashsum != chksum:
                msg = "Manifest has been modified, unexpected hashsum!"
                raise ValueError(f"{ret._files[-1].filename}: {msg}")

            ret._manifest = IH5Manifest.parse_file(manifest_file)
            # NOTE: the manifest UUID is not compared against the user block here,
            # because as long as the hashsum of the manifest is enforced,
            # a mismatching UUID cannot happen.
        # all looks good
        return ret

    # Override to also check user block extension
    def _check_ublock(
        self,
        filename: Union[str, Path],
        ub: IH5UserBlock,
        prev: Optional[IH5UserBlock] = None,
        check_hashsum: bool = True,
    ):
        """Run the parent checks and ensure that only a base container is a stub.

        A user block that has a predecessor (`prev` is not None) must not be
        marked as stub container in its manifest extension.
        """
        super()._check_ublock(filename, ub, prev, check_hashsum)
        # Try getting manifest info in the userblock.
        # If it is missing, probably we're opening a "raw" IH5Record or a messed up mix
        ubext = IH5UBExtManifest.get(ub)
        # we only allow to write patches on top of stubs,
        # but not have stubs on top of something else.
        # If something creates a patch that is (marked as) a stub, its a developer error.
        # If the ub ext is missing, then we must assume that it is not a stub.
        assert prev is None or ubext is None or not ubext.is_stub_container

    def _fixes_after_merge(self, file, ub):
        """Save the current manifest (if any) as the manifest of the merged file."""
        # if a manifest exists for the current dataset,
        # copy its manifest to overwrite the fresh one of the merged container
        # and fix its user block
        if self._manifest is not None:
            # check that new userblock inherited the original linked manifest
            ext = IH5UBExtManifest.get(ub)
            assert ext is not None and ext.manifest_uuid == self.manifest.manifest_uuid
            # overwrite the "fresh" manifest from merge with the original one
            self.manifest.save(self._manifest_filepath(file))

    # Override to prevent merge if a stub is present
    def merge_files(self, target: Path):
        """Merge like the parent class, but refuse if any container is a stub.

        Raises a ValueError if a user block in `self.ih5_meta` is marked as stub.
        """

        def is_stub(x):
            ext = IH5UBExtManifest.get(x)
            # missing ext -> not a stub (valid stub has ext + is marked as stub)
            return ext is not None and ext.is_stub_container

        if any(map(is_stub, self.ih5_meta)):
            raise ValueError("Cannot merge, files contain a stub!")

        return super().merge_files(target)

    # Override to create skeleton and dir hashsums, write manifest and add to user block
    # Will inherit old manifest extensions, unless overridden by passed argument
    def commit_patch(self, **kwargs) -> None:
        """Commit the patch, link it to a new manifest and write that manifest.

        The optional keyword argument `manifest_exts` replaces the manifest
        extensions; if it is missing, the extensions of the currently loaded
        manifest (if any) are inherited. All other keyword arguments are
        passed on to the parent implementation.

        If the parent commit raises a ValueError, the previous user block is
        put back and the exception is re-raised, without writing a manifest.
        """
        # is_stub == True only if called from create_stub!!! (NOT for the "end-user"!)
        is_stub = kwargs.pop("__is_stub__", False)
        exts = kwargs.pop("manifest_exts", None)

        # create manifest for the new patch
        mf = self._fresh_manifest()
        if self._manifest is not None:  # inherit attached data, if manifest exists
            mf.manifest_exts = self.manifest.manifest_exts
        if exts is not None:  # override, if extensions provided
            mf.manifest_exts = exts

        old_ub = self._ublock(-1)  # keep ref in case anything goes wrong
        # prepare new user block that links to the prospective manifest
        new_ub = old_ub.copy()
        # copy() is shallow, so without its own dict the copy would share ub_exts
        # with old_ub and the update below would also modify the block that is
        # kept for the rollback
        new_ub.ub_exts = dict(old_ub.ub_exts)
        IH5UBExtManifest(
            is_stub_container=is_stub,
            manifest_uuid=mf.manifest_uuid,
            manifest_hashsum=qualified_hashsum(bytes(mf)),
        ).update(new_ub)

        # try writing new container
        self._set_ublock(-1, new_ub)
        try:
            super().commit_patch(**kwargs)
        except ValueError:  # some checks failed
            self._set_ublock(-1, old_ub)  # reset current user block
            raise

        # as everything is fine, finally (over)write manifest here and on disk
        self._manifest = mf
        mf.save(self._manifest_filepath(self._files[-1].filename))

    @classmethod
    def create_stub(
        cls,
        record: Union[Path, str],
        manifest_file: Path,
    ) -> IH5MFRecord:
        """Create a stub base container for patching an existing but unavailable record.

        The stub is based on the user block of a real IH5 record container line
        and the skeleton of the overlay structure (as returned by `IH5Skeleton`),
        which are taken from a provided manifest file.

        Patches created on top of the stub are compatible with the original record
        whose metadata the stub is based on.

        The returned container is read-only and only serves as base for patches.
        """
        manifest = IH5Manifest.parse_file(manifest_file)

        skeleton: IH5Skeleton = manifest.skeleton
        user_block: IH5UserBlock = manifest.user_block.copy()

        # the manifest-stored user block has no manifest extension itself - create new
        # based on passed manifest.
        # mark it as stub in extra metadata now! important to avoid accidents!
        # must pass it in like that, because the container will be auto-committed.
        ubext = IH5UBExtManifest(
            is_stub_container=True,  # <- the ONLY place where this is allowed!
            manifest_uuid=manifest.manifest_uuid,
            manifest_hashsum=hashsum_file(manifest_file),
        )
        ubext.update(user_block)

        # create and finalize the stub (override userblock and create skeleton structure)
        ds = IH5MFRecord._create(Path(record))
        init_stub_base(ds, user_block, skeleton)  # prepares structure and user block
        # commit_patch() completes stub + fixes the hashsum
        ds.commit_patch(__is_stub__=True)
        assert not ds._has_writable

        return ds