Coverage for src/metador_core/util/hashsums.py: 50%

64 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-11-02 09:33 +0000

1from __future__ import annotations 

2 

3import hashlib 

4import os 

5from io import BytesIO 

6from pathlib import Path 

7from typing import Any, BinaryIO, Dict, Optional, Union 

8 

_hash_alg = {
    # "md5": hashlib.md5,
    # "sha1": hashlib.sha1,
    "sha256": hashlib.sha256,
    "sha512": hashlib.sha512,
}
"""Supported hashsum algorithms."""

# Number of bytes requested per read() call when hashing a stream. The digest
# does not depend on how the input is chunked, so this only trades memory for
# fewer read calls.
_CHUNK_SIZE = 64 * 1024


def hashsum(data: Union[bytes, BinaryIO], alg: str) -> str:
    """Compute the hex digest of binary data using the selected algorithm.

    Args:
        data: Either a bytes-like object (``bytes``, ``bytearray``,
            ``memoryview``), which is hashed directly, or a binary stream
            exposing ``read(size)``, which is consumed from its current
            position until it returns an empty/falsy chunk.
        alg: Name of the algorithm; must be a key of ``_hash_alg``.

    Returns:
        The hexadecimal digest string (without any algorithm prefix).

    Raises:
        ValueError: If ``alg`` is not a supported algorithm name.
    """
    try:
        h = _hash_alg[alg]()
    except KeyError:
        # The KeyError is an implementation detail; hide it from the traceback.
        raise ValueError(f"Unsupported hashsum: {alg}") from None

    if isinstance(data, (bytes, bytearray, memoryview)):
        # In-memory data needs no intermediate stream or chunking.
        h.update(data)
        return h.hexdigest()

    while True:
        chunk = data.read(_CHUNK_SIZE)
        if not chunk:
            break
        h.update(chunk)

    return h.hexdigest()

34 

35 

# Default value of the `alg` parameter of qualified_hashsum, file_hashsum and
# dir_hashsums. qualified_hashsum also places the algorithm name, followed by
# a colon, in front of the hex digest it returns.
DEF_HASH_ALG = "sha256"
"""Algorithm to use and string to prepend to a resulting hashsum."""

38 

39 

def qualified_hashsum(data: Union[bytes, BinaryIO], alg: str = DEF_HASH_ALG):
    """Return the hashsum of `data`, prefixed with the algorithm name.

    The digest is computed by `hashsum`; the result has the form
    ``<alg>:<hexdigest>``.
    """
    digest = hashsum(data, alg)
    return f"{alg}:{digest}"

43 

44 

def file_hashsum(path: Path, alg: str = DEF_HASH_ALG):
    """Return the qualified hashsum of the file located at `path`.

    The file is opened in binary mode and passed as a stream to
    `qualified_hashsum`, whose result is returned unchanged.
    """
    with open(path, mode="rb") as stream:
        result = qualified_hashsum(stream, alg)
    return result

48 

49 

# Type alias for the nested mapping returned by dir_hashsums. As built there,
# file values are qualified hashsums ("<alg>:<hexdigest>") and symlink values
# are "symlink:" followed by the link target relative to the scanned directory.
DirHashsums = Dict[str, Any]
"""Nested dict representing a directory.

str values represent files (checksum) or symlinks (target path),
dict values represent sub-directories.
"""

56 

57 

def rel_symlink(base: Path, dir: Path) -> Optional[Path]:
    """Express the target of a symlink as a path relative to `base`.

    The link target is interpreted relative to the directory containing the
    symlink and then fully resolved, which removes any ``..`` segments.

    Args:
        base: Directory the result should be relative to.
        dir: Path of the symlink to inspect.

    Returns:
        The resolved target relative to the resolved `base`, or None if the
        target lies outside of `base`.
    """
    target = dir.parent.joinpath(os.readlink(str(dir)))
    try:
        resolved_target = target.resolve()
        resolved_base = base.resolve()
        return resolved_target.relative_to(resolved_base)
    except ValueError:
        # relative_to() refuses paths that are not below the base directory
        return None

70 

71 

def dir_hashsums(dir: Path, alg: str = DEF_HASH_ALG) -> DirHashsums:
    """Return hashsums of all files.

    Resulting paths are relative to the provided `dir`.

    In-directory symlinks are treated like files and the target is stored
    instead of computing a checksum.

    Out-of-directory symlinks are not allowed.

    Args:
        dir: Root directory to scan recursively.
        alg: Name of the hashsum algorithm passed on to `file_hashsum`.

    Returns:
        Nested dict mirroring the directory tree: sub-directories map to
        dicts, files map to their qualified hashsum and symlinks map to
        ``"symlink:"`` followed by the target relative to `dir`.

    Raises:
        ValueError: If a symlink inside `dir` points outside of `dir`.
    """
    ret: Dict[str, Any] = {}
    for path in dir.rglob("*"):
        relpath = path.relative_to(dir)

        # Check for symlinks first: Path.is_file() follows symlinks, so a link
        # to a regular file would otherwise be hashed like a file and a link
        # leaving the directory would go unnoticed.
        is_sym = path.is_symlink()
        is_leaf = is_sym or path.is_file()

        val = ""
        if is_sym:
            sym_trg = rel_symlink(dir, path)
            if sym_trg is None:
                raise ValueError(f"Symlink inside '{dir}' points to the outside!")
            val = "symlink:" + str(sym_trg)  # value = symlink target
        elif is_leaf:
            val = file_hashsum(path, alg)  # value = hashsum

        # For leaves only the parent directories become nested dicts; every
        # other entry (i.e. a directory) gets a dict of its own as well.
        # Using .parts keeps this independent of the OS path separator and
        # yields no segments for entries located directly in `dir`.
        dir_parts = relpath.parent.parts if is_leaf else relpath.parts

        # create nested dicts, if not existing yet
        curr = ret
        for seg in dir_parts:
            curr = curr.setdefault(seg, {})

        # store file hashsum or symlink target
        if is_leaf:
            curr[relpath.name] = val

    return ret

116 

117 

118# ----