Coverage for src/metador_core/packer/utils.py: 65%
52 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-02 09:33 +0000
1"""Various helper functions that packers can use."""
3from __future__ import annotations
5import urllib.parse
6from json.decoder import JSONDecodeError
7from pathlib import Path
8from typing import Optional, Union
10import h5py
11import numpy
12from pydantic import ValidationError
14from ..container import MetadorContainer, MetadorDataset, MetadorGroup
15from ..harvester import harvest
16from ..plugins import harvesters, schemas
17from ..schema import MetadataSchema
18from .types import DirValidationErrors
21def _h5_wrap_bytes(bs: bytes):
22 """Wrap bytes with numpy.void if non-empty, or return h5py.Empty."""
23 # NOTE: need to think what to do in case we provide non-HDF backends
24 # should not be provided directly to users! it's an internal detail!
25 return numpy.void(bs) if len(bs) else h5py.Empty("b")
def check_metadata_file(path: Path, **kwargs):
    """Check a metadata file, return error object.

    If `required` is set, will add an error if file is missing.

    If `schema` is passed and file exists, will validate the file and log errors.

    Combine both to check that a file does exist and is valid according to a schema.

    Keyword Args:
        required (bool): Whether a missing file is reported as an error (default: False).
        schema (MetadataSchema): Schema class used to validate the file contents, if any.

    Returns:
        DirValidationErrors object collecting any problems found.
    """
    is_required: bool = kwargs.get("required", False)
    schema_cls: MetadataSchema = kwargs.get("schema", None)
    errors = DirValidationErrors()

    file_present = path.is_file()
    if is_required and not file_present:
        errors.add(str(path), f"Required metadata file not found: '{path}'")
    if file_present and schema_cls is not None:
        try:
            # parse + validate in one step; any failure is recorded, not raised
            schema_cls.parse_file(path)
        except (JSONDecodeError, ValidationError, FileNotFoundError) as e:
            errors.add(str(path), str(e))

    return errors
# Pinned `core.file` schema plugin (version 0.1.0) used below to harvest and
# type-check the metadata attached to embedded files.
FileMeta = schemas.get("core.file", (0, 1, 0))
def pack_file(
    node: Union[MetadorContainer, MetadorGroup],
    file_path: Union[Path, str],
    *,
    target: Optional[str] = None,
    metadata: Optional[MetadataSchema] = None,
) -> MetadorDataset:
    """Embed a file, adding minimal generic metadata to it.

    Will also ensure that the attached metadata has RO-Crate compatible @id set correctly.

    Args:
        node: Container where to embed the file contents
        file_path: Path of an existing file to be embedded
        target: Fresh path in container where to place the file
        metadata: If provided, will attach this instead of harvesting defaults.

    Returns:
        Dataset of new embedded file.

    Raises:
        ValueError: If the target path already exists in the container, the
            source file does not exist, or the provided metadata is not a
            `core.file`-compatible schema plugin instance.
    """
    file_path = Path(file_path)

    # if no target path given, use <current node path / file name>
    if not target:
        target = file_path.name

    # check container and file
    if target in node:
        # BUGFIX: report the conflicting target path — the original message
        # interpolated `node` (the container object) instead of `target`.
        raise ValueError(f"Path '{target}' already exists in given container or group!")
    if not file_path.is_file():
        raise ValueError(f"Path '{file_path}' does not look like an existing file!")

    if not metadata:
        # no metadata given -> harvest minimal information about a file
        hv_file = harvesters["core.file.generic"]
        metadata = harvest(FileMeta, [hv_file(filepath=file_path)])
    else:
        metadata = metadata.copy()  # defensive copying!

    # check metadata: must be an instance of the pinned core.file schema
    # and a registered schema plugin (so it can be serialized by name).
    if not isinstance(metadata, FileMeta):
        msg = f"Provided metadata is not compatible with '{FileMeta.Plugin.name}'!"
        raise ValueError(msg)
    if not schemas.is_plugin(type(metadata)):
        msg = f"Given metadata is a {type(metadata)}, which is not a schema plugin!"
        raise ValueError(msg)

    data = _h5_wrap_bytes(file_path.read_bytes())
    ret = node.create_dataset(target, data=data)

    # set file metadata @id to be relative to dataset root just like RO Crate wants
    metadata.id_ = urllib.parse.quote(f".{ret.name}")

    # embed file and metadata in container:
    ret.meta[metadata.Plugin.name] = metadata
    return ret