Coverage for src/metador_core/ih5/manifest.py: 100%

125 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-11-02 09:33 +0000

1"""Sidecar JSON file storing a skeleton to create stubs and patch containers.""" 

2from __future__ import annotations 

3 

4from pathlib import Path 

5from typing import Any, Dict, List, Optional, Union 

6from uuid import UUID, uuid1 

7 

8from pydantic import BaseModel 

9 

10from ..schema.types import QualHashsumStr 

11from ..util.hashsums import qualified_hashsum 

12from .record import IH5Record, IH5UserBlock, hashsum_file 

13from .skeleton import IH5Skeleton, init_stub_base 

14 

15 

class IH5Manifest(BaseModel):
    """Sidecar metadata describing a collection of IH5 record files.

    The manifest carries the skeleton of the container, which makes it possible
    to create a patch for an IH5Record whose data is not locally available,
    as long as the manifest is.
    """

    # UUID identifying the manifest file itself, so its filename is irrelevant.
    manifest_uuid: UUID

    # Copy of the user block, with the manifest extension section left out.
    user_block: IH5UserBlock

    # Skeleton computed with IH5Skeleton; it is what a stub gets created from.
    skeleton: IH5Skeleton

    # Arbitrary extensions, analogous to the ones in IH5UserBlock.
    manifest_exts: Dict[str, Any]

    @classmethod
    def from_userblock(cls, ub: IH5UserBlock, skeleton=None, exts=None) -> IH5Manifest:
        """Build a manifest from a user block.

        The user block is copied, and the copy keeps every extension section
        except the manifest one (a manifest embedding its own link would be
        circular). A falsy `skeleton` or `exts` is replaced by an empty dict.
        The new manifest gets a freshly generated UUID.
        """
        if not skeleton:
            skeleton = {}
        if not exts:
            exts = {}

        stripped = ub.copy()
        own_section = IH5UBExtManifest.ext_name()
        foreign_exts = {}
        for name, value in ub.ub_exts.items():
            if name != own_section:
                foreign_exts[name] = value
        stripped.ub_exts = foreign_exts

        return cls(
            manifest_uuid=uuid1(),
            user_block=stripped,
            skeleton=skeleton,
            manifest_exts=exts,
        )

    def __bytes__(self) -> bytes:
        """Return the manifest as indented JSON, encoded as UTF-8 bytes.

        A trailing newline is appended, because text editors behave confusingly
        without one (e.g. vim silently adds a hidden trailing newline), see
        https://stackoverflow.com/questions/729692/why-should-text-files-end-with-a-newline
        """
        text = self.json(indent=2)
        return f"{text}\n".encode("utf-8")

    def save(self, path: Path):
        """Write the manifest (exactly as returned by bytes()) to a file."""
        payload = bytes(self)
        with open(path, "wb") as out:
            out.write(payload)
            out.flush()

61 

62 

class IH5UBExtManifest(BaseModel):
    """User block extension of IH5 that adds stub and manifest support."""

    is_stub_container: bool
    """True if the file mirrors the structure of another container but holds no data."""

    manifest_uuid: UUID
    """UUID of the manifest file belonging to this IH5 file."""

    manifest_hashsum: QualHashsumStr
    """Hashsum of the manifest file belonging to this IH5 file."""

    @classmethod
    def ext_name(cls) -> str:
        """Return the user block extension section name for stub and manifest info."""
        return "ih5mf_v01"

    @classmethod
    def get(cls, ub: IH5UserBlock) -> Optional[IH5UBExtManifest]:
        """Return the extension metadata parsed from the user block.

        Yields None if the user block has no section for this extension.
        """
        section = cls.ext_name()
        if section in ub.ub_exts:
            return cls.parse_obj(ub.ub_exts[section])
        return None

    def update(self, ub: IH5UserBlock):
        """Store this extension metadata in the user block, replacing any old entry."""
        section = self.ext_name()
        ub.ub_exts[section] = self.dict()

90 

91 

class IH5MFRecord(IH5Record):
    """IH5Record extended by a manifest file.

    The manifest file is a sidecar JSON file that contains enough information to support
    the creation of a stub container and patching a dataset without having the actual
    container locally available.

    In a chain of container files, only the base container may be a stub.
    All files without the manifest extension in the userblock are considered not stubs.

    An IH5MFRecord is a valid IH5Record (the manifest file then is simply ignored).
    Also, it is possible to open an IH5Record as IH5MFRecord and turn it into a
    valid IH5MFRecord by committing a patch (this will create the missing manifest).

    In addition to the ability to create stubs, the manifest file can be used to carry
    information that should be attached to a container, but is too large or inappropriate
    for storage in the userblock (e.g. should be available separately).

    The manifest should store information that applies semantically to the whole fileset
    at the current patch level, it MUST NOT be required to have manifest files for each
    ih5 patch. Additional information stored in the manifest is inherited to the
    manifest of successive patches until overridden.

    The main use case of this extension is to be used by automated packers
    from suitably prepared source directories, to be uploaded to remote locations,
    and for these packers being able to create patches for the record without
    access to all the data containers (based only on the most recent manifest file).
    """

    # Appended verbatim (no separator) to a container path to get its manifest path.
    MANIFEST_EXT: str = "mf.json"

    _manifest: Optional[IH5Manifest] = None
    """Manifest of newest loaded container file (only None for new uncommitted records)."""

    @property
    def manifest(self) -> IH5Manifest:
        """Return loaded manifest object of latest committed record patch.

        Raises:
            ValueError: If no manifest is loaded (fresh, not yet committed record).
        """
        if self._manifest is None:  # should only happen with fresh create()d records
            raise ValueError("No manifest exists yet! Did you forget to commit?")
        return self._manifest

    def _fresh_manifest(self) -> IH5Manifest:
        """Return new manifest based on current state of the record.

        Uses the user block of the latest container and a skeleton computed for
        this record; the manifest extensions start out empty.
        """
        ub = self._ublock(-1)
        skel = IH5Skeleton.for_record(self)
        return IH5Manifest.from_userblock(ub, skeleton=skel, exts={})

    @classmethod
    def _manifest_filepath(cls, record: Union[str, Path]) -> Path:
        """Return canonical filename of manifest based on path of a container file.

        The result is the container path with `MANIFEST_EXT` appended directly.
        """
        return Path(f"{str(record)}{cls.MANIFEST_EXT}")

    # Override to also load and check latest manifest
    @classmethod
    def _open(cls, paths: List[Path], **kwargs):
        """Open the record files and load the manifest linked by the latest container.

        Args:
            paths: Container files, passed on to the parent implementation.
            **kwargs: May contain `manifest_file` (path of the manifest to use,
                as `Path` or `str`); everything else goes to the parent.

        Raises:
            ValueError: If the latest container links a manifest, but the manifest
                file does not exist or its hashsum differs from the linked one.
        """
        manifest_file: Optional[Path] = kwargs.pop("manifest_file", None)
        ret: IH5MFRecord = super()._open(paths, **kwargs)

        if manifest_file is None:
            # if not given explicitly, infer correct manifest filename
            # based on logically latest container (they are sorted after parent init)
            manifest_file = cls._manifest_filepath(ret._files[-1].filename)
        else:
            # the value comes unchecked through **kwargs and may be a plain string,
            # which would break the Path method calls below
            manifest_file = Path(manifest_file)

        # for latest container, check linked manifest (if any) against given/inferred one
        ub = ret._ublock(-1)
        ubext = IH5UBExtManifest.get(ub)
        if ubext is not None:
            if not manifest_file.is_file():
                msg = f"Manifest file {manifest_file} does not exist, cannot open!"
                raise ValueError(f"{ret._files[-1].filename}: {msg}")

            chksum = hashsum_file(manifest_file)
            if ubext.manifest_hashsum != chksum:
                msg = "Manifest has been modified, unexpected hashsum!"
                raise ValueError(f"{ret._files[-1].filename}: {msg}")

            ret._manifest = IH5Manifest.parse_file(manifest_file)
            # NOTE: the manifest UUID is not compared against the user block here,
            # because as long as the hashsum of the manifest is enforced,
            # a UUID mismatch cannot happen.
        # all looks good
        return ret

    # Override to also check user block extension
    def _check_ublock(
        self,
        filename: Union[str, Path],
        ub: IH5UserBlock,
        prev: Optional[IH5UserBlock] = None,
        check_hashsum: bool = True,
    ):
        """Run the parent user block checks and forbid stubs on top of other files.

        Raises:
            AssertionError: If `prev` is given and `ub` is marked as a stub
                (this check is an `assert`, so it is skipped under `python -O`).
        """
        super()._check_ublock(filename, ub, prev, check_hashsum)
        # Try getting manifest info in the userblock.
        # If it is missing, probably we're opening a "raw" IH5Record or a messed up mix
        ubext = IH5UBExtManifest.get(ub)
        # we only allow to write patches on top of stubs,
        # but not have stubs on top of something else.
        # If something creates a patch that is (marked as) a stub, it's a developer error.
        # If the ub ext is missing, then we must assume that it is not a stub.
        assert prev is None or ubext is None or not ubext.is_stub_container

    def _fixes_after_merge(self, file, ub):
        """Replace the manifest of a merged container by the currently loaded one.

        Does nothing if no manifest is loaded. Otherwise `ub` must link the loaded
        manifest (checked by an assertion), and the loaded manifest is saved at
        the manifest path derived from `file`.
        """
        # if a manifest exists for the current dataset,
        # copy its manifest to overwrite the fresh one of the merged container
        # and fix its user block
        if self._manifest is not None:
            # check that new userblock inherited the original linked manifest
            ext = IH5UBExtManifest.get(ub)
            assert ext is not None and ext.manifest_uuid == self.manifest.manifest_uuid
            # overwrite the "fresh" manifest from merge with the original one
            self.manifest.save(self._manifest_filepath(file))

    # Override to prevent merge if a stub is present
    def merge_files(self, target: Path):
        """Merge the record files via the parent class, unless a stub is involved.

        Raises:
            ValueError: If any user block in `self.ih5_meta` is marked as a stub.
        """

        def is_stub(x):
            ext = IH5UBExtManifest.get(x)
            # missing ext -> not a stub (valid stub has ext + is marked as stub)
            return ext is not None and ext.is_stub_container

        if any(map(is_stub, self.ih5_meta)):
            raise ValueError("Cannot merge, files contain a stub!")

        return super().merge_files(target)

    # Override to create skeleton and dir hashsums, write manifest and add to user block
    # Will inherit old manifest extensions, unless overridden by passed argument
    def commit_patch(self, **kwargs) -> None:
        """Commit the current patch and write a matching manifest file.

        Args:
            **kwargs: May contain `manifest_exts` (replaces the manifest extensions
                that are otherwise inherited from the loaded manifest) and the
                internal flag `__is_stub__` (only passed by `create_stub`);
                everything else goes to the parent implementation.

        Raises:
            ValueError: Re-raised from the parent commit; the previous user block
                is restored before re-raising.
        """
        # is_stub == True only if called from create_stub!!! (NOT for the "end-user"!)
        is_stub = kwargs.pop("__is_stub__", False)
        exts = kwargs.pop("manifest_exts", None)

        # create manifest for the new patch
        mf = self._fresh_manifest()
        if self._manifest is not None:  # inherit attached data, if manifest exists
            mf.manifest_exts = self.manifest.manifest_exts
        if exts is not None:  # override, if extensions provided
            mf.manifest_exts = exts

        old_ub = self._ublock(-1)  # keep ref in case anything goes wrong
        # prepare new user block that links to the prospective manifest
        new_ub = old_ub.copy()
        IH5UBExtManifest(
            is_stub_container=is_stub,
            manifest_uuid=mf.manifest_uuid,
            manifest_hashsum=qualified_hashsum(bytes(mf)),
        ).update(new_ub)

        # try writing new container
        self._set_ublock(-1, new_ub)
        try:
            super().commit_patch(**kwargs)
        except ValueError:  # some checks failed
            self._set_ublock(-1, old_ub)  # reset current user block
            raise

        # as everything is fine, finally (over)write manifest here and on disk
        self._manifest = mf
        mf.save(self._manifest_filepath(self._files[-1].filename))

    @classmethod
    def create_stub(
        cls,
        record: Union[Path, str],
        manifest_file: Path,
    ) -> IH5MFRecord:
        """Create a stub base container for patching an existing but unavailable record.

        The stub is based on the user block of a real IH5 record container line
        and the skeleton of the overlay structure (as returned by `IH5Skeleton`),
        which are taken from a provided manifest file.

        Patches created on top of the stub are compatible with the original record
        whose metadata the stub is based on.

        The returned container is read-only and only serves as base for patches.
        """
        manifest = IH5Manifest.parse_file(manifest_file)

        skeleton: IH5Skeleton = manifest.skeleton
        user_block: IH5UserBlock = manifest.user_block.copy()

        # the manifest-stored user block has no manifest extension itself - create new
        # based on passed manifest.
        # mark it as stub in extra metadata now! important to avoid accidents!
        # must pass it in like that, because the container will be auto-committed.
        ubext = IH5UBExtManifest(
            is_stub_container=True,  # <- the ONLY place where this is allowed!
            manifest_uuid=manifest.manifest_uuid,
            manifest_hashsum=hashsum_file(manifest_file),
        )
        ubext.update(user_block)

        # create and finalize the stub (override userblock and create skeleton structure)
        # NOTE(review): this names IH5MFRecord explicitly instead of using cls, so a
        # subclass calling create_stub still gets a plain IH5MFRecord - confirm intent.
        ds = IH5MFRecord._create(Path(record))
        init_stub_base(ds, user_block, skeleton)  # prepares structure and user block
        # commit_patch() completes stub + fixes the hashsum
        ds.commit_patch(__is_stub__=True)
        assert not ds._has_writable

        return ds