Coverage for src/metador_core/packer/utils.py: 65%

52 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-11-02 09:33 +0000

1"""Various helper functions that packers can use.""" 

2 

3from __future__ import annotations 

4 

5import urllib.parse 

6from json.decoder import JSONDecodeError 

7from pathlib import Path 

8from typing import Optional, Union 

9 

10import h5py 

11import numpy 

12from pydantic import ValidationError 

13 

14from ..container import MetadorContainer, MetadorDataset, MetadorGroup 

15from ..harvester import harvest 

16from ..plugins import harvesters, schemas 

17from ..schema import MetadataSchema 

18from .types import DirValidationErrors 

19 

20 

21def _h5_wrap_bytes(bs: bytes): 

22 """Wrap bytes with numpy.void if non-empty, or return h5py.Empty.""" 

23 # NOTE: need to think what to do in case we provide non-HDF backends 

24 # should not be provided directly to users! it's an internal detail! 

25 return numpy.void(bs) if len(bs) else h5py.Empty("b") 

26 

27 

def check_metadata_file(path: Path, **kwargs):
    """Check a metadata file, return error object.

    If `required` is set, will add an error if file is missing.

    If `schema` is passed and file exists, will validate the file and log errors.

    Combine both to check that a file does exist and is valid according to a schema.
    """
    must_exist: bool = kwargs.get("required", False)
    schema = kwargs.get("schema", None)  # schema class to validate against, if any
    errors = DirValidationErrors()

    file_found = path.is_file()
    if must_exist and not file_found:
        errors.add(str(path), f"Required metadata file not found: '{path}'")
    if file_found and schema is not None:
        # collect parse/validation problems instead of raising
        try:
            schema.parse_file(path)
        except (JSONDecodeError, ValidationError, FileNotFoundError) as err:
            errors.add(str(path), str(err))

    return errors

51 

52 

# Generic file-metadata schema plugin, pinned to version (0, 1, 0).
# Used as the expected metadata type for files embedded via `pack_file`.
FileMeta = schemas.get("core.file", (0, 1, 0))

54 

55 

def pack_file(
    node: Union[MetadorContainer, MetadorGroup],
    file_path: Union[Path, str],
    *,
    target: Optional[str] = None,
    metadata: Optional[MetadataSchema] = None,
) -> MetadorDataset:
    """Embed a file, adding minimal generic metadata to it.

    Will also ensure that the attached metadata has RO-Crate compatible @id set correctly.

    Args:
        node: Container where to embed the file contents
        file_path: Path of an existing file to be embedded
        target: Fresh path in container where to place the file
        metadata: If provided, will attach this instead of harvesting defaults.

    Returns:
        Dataset of new embedded file.

    Raises:
        ValueError: If the target path already exists in the container,
            the source file does not exist, or the metadata is not a
            `FileMeta`-compatible schema plugin instance.
    """
    file_path = Path(file_path)

    # if no target path given, use <current node path / file name>
    if not target:
        target = file_path.name

    # check container and file
    if target in node:
        # FIX: message previously interpolated `node` instead of the
        # conflicting `target` path, producing a misleading error.
        raise ValueError(f"Path '{target}' already exists in given container or group!")
    if not file_path.is_file():
        raise ValueError(f"Path '{file_path}' does not look like an existing file!")

    if not metadata:
        # no metadata given -> harvest minimal information about a file
        hv_file = harvesters["core.file.generic"]
        metadata = harvest(FileMeta, [hv_file(filepath=file_path)])
    else:
        metadata = metadata.copy()  # defensive copying!

    # check metadata
    if not isinstance(metadata, FileMeta):
        msg = f"Provided metadata is not compatible with '{FileMeta.Plugin.name}'!"
        raise ValueError(msg)
    if not schemas.is_plugin(type(metadata)):
        msg = f"Given metadata is a {type(metadata)}, which is not a schema plugin!"
        raise ValueError(msg)

    data = _h5_wrap_bytes(file_path.read_bytes())
    ret = node.create_dataset(target, data=data)

    # set file metadata @id to be relative to dataset root just like RO Crate wants
    metadata.id_ = urllib.parse.quote(f".{ret.name}")

    # embed file and metadata in container:
    ret.meta[metadata.Plugin.name] = metadata
    return ret