class GenericPacker(Packer):
    """The generic packer is demonstrating how a packer can be implemented.

    It will pack CSV tables with metadata into corresponding HDF5 containers,
    and it will pack all other kinds of files as embedded opaque blobs.

    Both kinds of nodes will have corresponding metadata attributes attached.

    The directory is expected to have a _meta.yaml file in the container root
    and each CSV file file.csv needs a companion metadata file file.csv_meta.yaml.

    All symlinks inside the directory are completely ignored.

    This packer does very verbose logging for didactic purposes.
    Other packers may log their actions as they deem appropriate.
    """

    class Plugin:
        # Plugin registration info for the packer plugin system
        # (name/version scheme defined by the surrounding framework).
        name = "core.generic"
        version = (0, 1, 0)

    # Suffix marking metadata sidecar files (e.g. "file.csv" -> "file.csv_meta.yaml").
    META_SUFFIX: str = "_meta.yaml"

    @classmethod
    def sidecar_for(cls, path: Union[Path, str]) -> str:
        """Sidecar file name for given path."""
        return f"{path}{cls.META_SUFFIX}"

    @classmethod
    @overrides
    def check_dir(cls, data_dir: Path) -> DirValidationErrors:
        """Check that the data directory has a valid root metadata file.

        Only validates the mandatory `<data_dir>/_meta.yaml` against the
        BibMeta schema; per-file sidecars are not checked here.
        """
        print("--------")
        print("called check_dir")
        errs = DirValidationErrors()
        errs.update(
            check_metadata_file(
                data_dir / cls.META_SUFFIX, required=True, schema=BibMeta
            )
        )
        return errs

    @classmethod
    @overrides
    def update(cls, mc: MetadorContainer, data_dir: Path, diff: DirDiff):
        """Sync the container `mc` with the current state of `data_dir`.

        Walks the annotated diff of the directory and mirrors each change
        (add/remove/modify) into the container, mapping each filesystem
        path 1-to-1 to a container path. Symlinks and unchanged paths are
        skipped; `*.csv_meta.yaml` sidecars are handled together with
        their CSV file rather than on their own.
        """
        print("--------")
        print("called update")
        for path, dnode in diff.annotate(data_dir).items():
            # the status indicates whether the file was added, removed or modified
            status = diff.status(dnode)
            print(status.value, path)

            if dnode is None:  # unchanged paths in the directory have no diff node
                print("IGNORE:", path, "(unchanged)")
                continue  # nothing to do

            if path.is_symlink():  # we ignore symlinks in the data directory
                print("IGNORE:", path, "(symlink)")
                continue

            if path.name.lower().endswith(".csv_meta.yaml"):
                # will be taken care of when the CSV file is processed
                print("IGNORE:", path, "(sidecar file)")
                continue

            # for this simple packer, each file maps 1-to-1 to a container path
            key = f"{dnode.path}"  # path inside the container

            if status == DiffNode.Status.removed:  # entity was removed ->
                # also remove in container, if it was not a symlink (which we ignored)
                if dnode.prev_type != DiffNode.ObjType.symlink:
                    print("DELETE:", key)
                    del mc[key]
                continue

            if status == DiffNode.Status.modified:  # changed
                if dnode.prev_type == dnode.curr_type and path.is_dir():
                    continue  # a changed dir should already exist + remain in container

                # otherwise it was replaced either file -> dir or dir -> file, so
                # remove entity, proceeding with loop body to add new entity version
                print("DELETE:", key)
                del mc[key]

            # now we (re-)add new or modified entities:
            if path.is_dir():
                print("CREATE:", path, "->", key, "(dir)")
                mc.create_group(key)
            elif path.is_file():
                if path.name.endswith(cls.META_SUFFIX):
                    if key == cls.META_SUFFIX:
                        # update root meta
                        print("CREATE:", path, "->", key, "(biblio metadata)")
                        mc.meta["common_biblio"] = BibMeta.parse_file(path)
                    # NOTE(review): non-root *_meta.yaml sidecars fall through
                    # here without any action — apparently intentional, since
                    # CSV sidecars are consumed in the .csv branch below.
                else:
                    if path.name.lower().endswith(".csv"):
                        # embed CSV as numpy array with table metadata
                        print("CREATE:", path, "->", key, "(table)")
                        mc[key] = pandas.read_csv(path).to_numpy()  # type: ignore
                        mc[key].meta["common_table"] = TableMeta.for_file(
                            cls.sidecar_for(path)
                        )
                    elif path.name.lower().endswith((".jpg", ".jpeg", ".png")):
                        # embed image file with image-specific metadata
                        print("CREATE:", path, "->", key, "(image)")
                        embed_file(mc, key, path)
                        # mc[key].meta["common_image"] = image_meta_for(path)
                    else:
                        # treat as opaque blob and add file metadata
                        print("CREATE:", path, "->", key, "(file)")
                        embed_file(mc, key, path)