Coverage for src/somesy/pom_xml/xmlproxy.py: 94%
190 statements
« prev ^ index » next coverage.py v7.3.2, created at 2024-04-30 09:42 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2024-04-30 09:42 +0000
1"""Wrapper to provide dict-like access to XML via ElementTree."""
2from __future__ import annotations
4import xml.etree.ElementTree as ET
5from pathlib import Path
6from typing import Any, List, Optional, Union, cast
8import defusedxml.ElementTree as DET
# Shallow type alias, mostly for documentation purposes: it marks parameters and
# return values that are meant to be JSON-like (primitives, lists, dicts).
# Being an alias of `Any`, it does not enforce anything for a type checker.
JSONLike = Any
def load_xml(path: Path) -> ET.ElementTree:
    """Read an XML file from disk and return its ElementTree (comments are kept).

    Args:
        path: Location of the XML file; a value that is not already a `Path`
            is converted with `Path(path)` first.

    Returns:
        The tree produced by the defusedxml parser.
    """
    if not isinstance(path, Path):
        path = Path(path)

    # A tree builder with insert_comments=True keeps XML comments as nodes in
    # the tree instead of discarding them while parsing.
    comment_keeping_builder = ET.TreeBuilder(insert_comments=True)
    xml_parser = DET.XMLParser(target=comment_keeping_builder)
    return DET.parse(path, parser=xml_parser)
def indent(elem, level=0):
    """Pretty-print helper: rewrite whitespace-only text/tail of `elem` in-place.

    Text and tails that contain non-whitespace characters are left untouched.

    Args:
        elem: Element whose subtree is re-indented (it is modified in-place).
        level: Nesting depth of `elem`; one indentation unit is used per level.
    """
    # NOTE(review): the indentation unit appears here as a single space. The
    # literal may have been whitespace-collapsed on extraction -- confirm
    # against the repository before relying on the exact width.

    def _is_blank(value):
        return not value or not value.strip()

    pad = "\n" + " " * level

    if not len(elem):
        # Leaf: only the tail matters, and the root (level 0) is left alone.
        if level and _is_blank(elem.tail):
            elem.tail = pad
        return

    if _is_blank(elem.text):
        elem.text = pad + " "  # first child starts one level deeper
    if _is_blank(elem.tail):
        elem.tail = pad

    last_child = None
    for last_child in elem:
        indent(last_child, level + 1)

    # The tail of the last child precedes the closing tag of `elem`, so it has
    # to fall back to the indentation level of `elem` itself.
    if _is_blank(last_child.tail):
        last_child.tail = pad
class XMLProxy:
    """Class providing dict-like access to edit XML via ElementTree.

    Note that this wrapper facade is limited to a restricted (but useful) subset of XML:

    * XML attributes are not supported
    * DTDs are ignored (arbitrary keys can be queried and added)
    * each tag is assumed to EITHER contain text OR more nested tags
    * lists are treated atomically (no way to add/remove element from a collection)

    The semantics of `get` (and thus of `node[key]` / `key in node`) is as follows:

    * If there is no tag with that name, `get` returns None (`node[key]` raises KeyError)
    * If there are multiple tags with the same name, a list is returned
    * If there is a unique tag, the single value is returned without a list
    * Values are XMLProxy nodes, unless `deep=True` is passed, in which case they are
      converted with `to_jsonlike` (i.e. a tag without nested tags yields its text)
    """

    def _wrap(self, el: ET.Element) -> XMLProxy:
        """Wrap a different element, inheriting the same namespace."""
        return XMLProxy(el, default_namespace=self._def_ns)

    def _dump(self):
        """Dump XML to stdout (for debugging)."""
        ET.dump(self._node)

    def _qualified_key(self, key: str):
        """If passed key is not qualified, prepends the default namespace (if set).

        Raises:
            ValueError: If the key is an empty string.
        """
        if not key:
            raise ValueError("Key must not be an empty string!")
        if key.startswith("{") or not self._def_ns:
            return key
        return "{" + self._def_ns + "}" + key

    def _shortened_key(self, key: str):
        """Inverse of `_qualified_key` (strips default namespace from element name).

        Only a key qualified with exactly the default namespace is shortened;
        keys from any other namespace are returned unchanged.
        """
        if not self._def_ns:
            return key
        # Compare the full "{namespace}" prefix: a substring test would also
        # strip foreign namespaces that merely contain the default one.
        prefix = "{" + self._def_ns + "}"
        return key[len(prefix) :] if key.startswith(prefix) else key

    # ----

    def __init__(self, el: ET.Element, *, default_namespace: Optional[str] = None):
        """Wrap an existing XML ElementTree Element.

        Args:
            el: Element to wrap (it is referenced, not copied).
            default_namespace: Namespace assumed for keys that are not qualified.
        """
        self._node: ET.Element = el
        self._def_ns = default_namespace

    @classmethod
    def parse(cls, path: Union[str, Path], **kwargs) -> XMLProxy:
        """Parse an XML file into a wrapped ElementTree, preserving comments.

        Args:
            path: File to parse.
            kwargs: Passed on to the constructor (e.g. `default_namespace`).
        """
        path = path if isinstance(path, Path) else Path(path)
        return cls(load_xml(path).getroot(), **kwargs)

    def write(self, path: Union[str, Path], *, header: bool = True, **kwargs):
        """Write the XML DOM to an UTF-8 encoded file.

        Note that the wrapped tree is re-indented in-place before writing.

        Args:
            path: Target file.
            header: Whether to emit the XML declaration.
            kwargs: Passed on to `ElementTree.write`; `default_namespace` defaults
                to the default namespace of this node (if one is set).
        """
        path = path if isinstance(path, Path) else Path(path)
        et = ET.ElementTree(self._node)
        if self._def_ns and "default_namespace" not in kwargs:
            kwargs["default_namespace"] = self._def_ns
        indent(et.getroot())
        et.write(path, encoding="UTF-8", xml_declaration=header, **kwargs)

    def __repr__(self):
        """See `object.__repr__`."""
        return str(self._node)

    def __len__(self):
        """Return number of inner tags inside current XML element.

        Note that `bool(node)` is therefore False exactly when the XML node has
        no nested elements (i.e. is a leaf in the element tree).
        """
        return len(self._node)

    def __iter__(self):
        """Iterate the nested elements in-order (each one wrapped as XMLProxy)."""
        return map(self._wrap, iter(self._node))

    @property
    def namespace(self) -> Optional[str]:
        """Default namespace of this node."""
        return self._def_ns

    @property
    def is_comment(self):
        """Return whether the current element node is an XML comment."""
        # comment nodes carry a non-string object in their `tag` attribute
        return not isinstance(self._node.tag, str)

    @property
    def tag(self) -> Optional[str]:
        """Return tag name of this element (unless it is a comment)."""
        if self.is_comment:
            return None
        return self._shortened_key(self._node.tag)

    @tag.setter
    def tag(self, val: str):
        """Set the tag of this element."""
        if self.is_comment:
            raise ValueError("Cannot set tag name for comment element!")
        self._node.tag = self._qualified_key(val)

    # ---- helpers ----

    def to_jsonlike(
        self,
        *,
        strip_default_ns: bool = True,
        keep_root: bool = False,
    ) -> JSONLike:
        """Convert XML node to a JSON-like primitive, array or dict (ignoring attributes).

        Note that all leaf values are strings (i.e. not parsed to bool/int/float etc.).
        Comments become entries with keys of the form `__comment_<N>__`, and
        repeated tag names are collected into a list under a single key.

        Args:
            strip_default_ns: Do not qualify keys from the default namespace
            keep_root: If true, the root tag name will be preserved (`{"root_tag": {...}}`).
                It has no effect on a node without nested elements, which is
                always returned as its plain text.
        """
        if not len(self):  # leaf -> assume it's a primitive value
            return self._node.text or ""

        result: dict = {}
        comment_count = 0
        for child in self:
            raw_tag = child._node.tag
            if not isinstance(raw_tag, str):  # comment node -> synthetic key
                comment_count += 1
                key = f"__comment_{comment_count}__"
            elif strip_default_ns:
                key = self._shortened_key(raw_tag)
            else:
                key = raw_tag

            value = child.to_jsonlike(strip_default_ns=strip_default_ns)
            if key not in result:
                result[key] = value
            elif isinstance(result[key], list):
                result[key].append(value)
            else:  # second occurrence of this key -> switch to a list
                result[key] = [result[key], value]

        if not keep_root:
            return result
        # treat the root key the same way as the nested keys
        root_tag = self._node.tag
        root_key = self._shortened_key(root_tag) if strip_default_ns else root_tag
        return {root_key: result}

    @classmethod
    def _from_jsonlike_primitive(
        cls, val, *, elem_name: Optional[str] = None, **kwargs
    ) -> Union[str, XMLProxy]:
        """Convert a leaf node into a string value (i.e. return inner text).

        Returns a string (or an XML element, if elem_name is passed).

        Raises:
            TypeError: If the value is not None, a string, bool, int or float.
        """
        if val is None:
            ret = ""  # turn None into empty string
        elif isinstance(val, str):
            ret = val
        elif isinstance(val, bool):  # must be checked before int (bool is an int)
            ret = str(val).lower()  # True -> true / False -> false
        elif isinstance(val, (int, float)):
            ret = str(val)
        else:
            raise TypeError(
                f"Value of type {type(val)} is not JSON-like primitive: {val}"
            )

        if not elem_name:
            return ret
        # return the value wrapped as an element (needed in from_jsonlike)
        elem = ET.Element(elem_name)
        elem.text = ret
        return cls(elem, **kwargs)

    @classmethod
    def from_jsonlike(
        cls, val: JSONLike, *, root_name: Optional[str] = None, **kwargs: Any
    ):
        """Convert a JSON-like primitive, array or dict into an XML element.

        Note that booleans are serialized as `true`/`false` and None as an empty string.

        A list yields a list of elements (one per entry, all named `root_name`).
        Dict keys starting with `__comment_` are turned into XML comments, all
        other keys are used verbatim as tag names (i.e. they are not qualified
        with any namespace).

        Args:
            val: Value to convert into an XML element.
            root_name: If `val` is a dict, defines the tag name for the root element.
            kwargs: Additional arguments for XML element instantiation.
        """
        if isinstance(val, list):
            return [cls.from_jsonlike(x, root_name=root_name, **kwargs) for x in val]
        if not isinstance(val, dict):  # primitive val
            return cls._from_jsonlike_primitive(val, elem_name=root_name, **kwargs)

        # now the dict case remains
        elem = ET.Element(root_name or "root")
        for k, v in val.items():
            if k.startswith("__comment_"):
                # special key names are mapped to XML comments
                elem.append(ET.Comment(v if isinstance(v, str) else str(v)))
            elif isinstance(v, list):
                for vv in cls.from_jsonlike(v, root_name=k, **kwargs):
                    elem.append(vv._node)
            elif not isinstance(v, dict):  # primitive val
                # FIXME: use better case-splitting for type of function to avoid cast
                tmp = cast(
                    XMLProxy,
                    cls._from_jsonlike_primitive(v, elem_name=k, **kwargs),
                )
                elem.append(tmp._node)
            else:  # dict
                elem.append(cls.from_jsonlike(v, root_name=k, **kwargs)._node)

        return cls(elem, **kwargs)

    # ---- dict-like access ----

    def get(self, key: str, *, as_nodes: bool = False, deep: bool = False):
        """Get sub-structure(s) of value(s) matching desired XML tag name.

        * If there is no matching element, will return None.
        * If there are multiple matching elements, will return them all as a list.
        * If there is a single matching element, will return that element without a list.

        Args:
            key: tag name to retrieve
            as_nodes: If true, will *always* return a list of (zero or more) XML nodes
            deep: Expand nested XML elements instead of returning them as XML nodes

        Raises:
            ValueError: If the key is empty, or both `as_nodes` and `deep` are set.
        """
        # NOTE: could allow to retrieve comments when using empty string/none as key?

        if as_nodes and deep:
            raise ValueError("as_nodes and deep are mutually exclusive!")
        if not key:
            raise ValueError("Key must not be an empty string!")
        # if not fully qualified + default NS is given, use it for query
        key = self._qualified_key(key)

        ns: List[XMLProxy] = [self._wrap(el) for el in self._node.findall(key)]
        if as_nodes:  # return it as a list of xml nodes
            return ns
        if not ns:  # no element
            return None

        ret = [x.to_jsonlike() for x in ns] if deep else ns
        return ret[0] if len(ret) == 1 else ret

    def __getitem__(self, key: str):
        """Acts like `dict.__getitem__`, implemented with `get`."""
        val = self.get(key)
        if val is None:
            raise KeyError(key)
        return val

    def __contains__(self, key: str) -> bool:
        """Acts like `dict.__contains__`, implemented with `get`."""
        return self.get(key) is not None

    def __delitem__(self, key: Union[str, XMLProxy]):
        """Delete a nested XML element with matching key name.

        Note that **all** XML elements with the given tag name are removed!

        To prevent this behavior, instead of a string tag name you can provide the
        exact element to be removed, i.e. if a node `node_a` represents the following XML:

        ```
        <a>
            <b>1</b>
            <c>2</c>
            <b>3</b>
        </a>
        ```

        Then we have that:

        * `del node_a["b"]` removes **both** tags, leaving just the `c` tag.
        * `del node_a[node_a["b"][1]]` removes just the second tag with the `3`.

        Raises:
            KeyError: If there is nothing to remove (no tag with that name, or the
                passed element is not directly nested in this node).
        """
        if isinstance(key, str):
            nodes = self.get(key, as_nodes=True)
        else:
            nodes = [key] if key._node in self._node else []

        if not nodes:
            raise KeyError(key)

        if self._node.text is not None:
            self._node.text = ""
        for child in nodes:
            self._node.remove(child._node)

    def _clear(self):
        """Remove contents of this XML element (e.g. for overwriting in-place)."""
        self._node.text = ""
        # need to store the children first, removal invalidates the iterator
        for child in list(self._node):
            self._node.remove(child)

    def __setitem__(self, key: Union[str, XMLProxy], val: Union[JSONLike, XMLProxy]):
        """Add or overwrite an inner XML tag.

        If there is exactly one matching tag, the value is substituted in-place.
        If the passed value is a list, all list entries are added in their own element.

        If there are multiple existing matches or target values, then
        **all** existing elements are removed and the new value(s) are added in
        new element(s) (i.e. coming after other unrelated existing elements)!

        To prevent this behavior, instead of a string tag name you can provide the
        exact element to be overwritten, i.e. if a node `node_a` represents the following XML:

        ```
        <a>
            <b>1</b>
            <c>2</c>
            <b>3</b>
        </a>
        ```

        Then we have that:

        * `node_a["b"] = 5` removes both existing tags and creates a new tag with the passed value(s).
        * `node_a[node_a["b"][1]] = 5` replaces the `3` in the second tag with the `5`.

        Note that the passed value must be either an XML element already, or be a pure JSON-like object.
        An XML element passed as value is nested inside the target element.

        Raises:
            ValueError: If an element is used as key together with a list value,
                if the element used as key is a comment, or if the key is empty.
        """
        if isinstance(key, str):
            targets = self.get(key, as_nodes=True)
            # delete all existing elements if multiple exist or are passed
            if len(targets) > 1 or (targets and isinstance(val, list)):
                del self[key]
                targets = []
            # now we can assume there's zero or one suitable target elements
            if targets:  # if it is one, clear it out
                targets[0]._clear()
            tag_name = key
        else:  # an XMLProxy object was passed as key -> try to use that
            if isinstance(val, list):
                raise ValueError(
                    "Cannot overwrite a single element with a list of values!"
                )
            # validate before mutating: a comment has no tag name to work with
            if key.is_comment:
                raise ValueError("Cannot overwrite a comment element!")
            # ensure the target node is cleared out and use it as target
            key._clear()
            targets = [key]
            tag_name = key.tag

        # ensure key string is qualified with a namespace
        key_name: str = self._qualified_key(tag_name)

        # normalize passed value(s) to be list (general case)
        values = val if isinstance(val, list) else [val]

        # ensure there is the required number of target element nodes
        for _ in range(len(values) - len(targets)):
            targets.append(self._wrap(ET.SubElement(self._node, key_name)))

        # normalize values to XML element nodes
        sources = []
        for value in values:
            if isinstance(value, XMLProxy):
                # put the element into a container, so that it is handled
                # just like the children of a converted JSON-like value
                container = self._wrap(ET.Element("dummy"))
                container._node.append(value._node)
            else:
                container = self.from_jsonlike(value, root_name=key_name)
            sources.append(container)

        for target, source in zip(targets, sources):
            # transplant node contents into existing element (so it is inserted in-place)
            target._node.text = source._node.text
            for child in list(source._node):
                target._node.append(child)