Coverage for src/somesy/pom_xml/xmlproxy.py: 94%
190 statements
« prev ^ index » next coverage.py v7.6.0, created at 2024-07-29 07:42 +0000
« prev ^ index » next coverage.py v7.6.0, created at 2024-07-29 07:42 +0000
1"""Wrapper to provide dict-like access to XML via ElementTree."""
3from __future__ import annotations
5import xml.etree.ElementTree as ET
6from pathlib import Path
7from typing import Any, List, Optional, Union, cast
9import defusedxml.ElementTree as DET
11# shallow type hint mostly for documentation purpose
12JSONLike = Any
def load_xml(path: Path) -> ET.ElementTree:
    """Read the XML file at `path` and return it as an ElementTree.

    The tree builder is created with `insert_comments=True`, so XML comments
    end up as nodes in the returned tree instead of being dropped.

    Args:
        path: Location of the XML file (non-`Path` values are converted).

    """
    source = path if isinstance(path, Path) else Path(path)
    # the hardened defusedxml parser feeds a stdlib builder that keeps comments
    builder = ET.TreeBuilder(insert_comments=True)
    return DET.parse(source, parser=DET.XMLParser(target=builder))
def indent(elem, level=0):
    """Pretty-print helper: rewrite whitespace-only text/tail values in place.

    Args:
        elem: Element whose subtree is re-indented (modified in place).
        level: Nesting depth of `elem`; controls the width of the padding.

    """

    def _blank(value):
        # None, empty and whitespace-only strings may be overwritten
        return not value or not value.strip()

    pad = "\n" + level * " "

    if not len(elem):
        # leaf: only the tail matters, and the root (level 0) is left alone
        if level and _blank(elem.tail):
            elem.tail = pad
        return

    if _blank(elem.text):
        elem.text = pad + " "
    if _blank(elem.tail):
        elem.tail = pad
    for child in elem:
        indent(child, level + 1)
    # the last child's tail precedes the closing tag of `elem`,
    # hence it is dedented back to the level of `elem` itself
    if _blank(child.tail):
        child.tail = pad
39class XMLProxy:
40 """Class providing dict-like access to edit XML via ElementTree.
42 Note that this wrapper facade is limited to a restricted (but useful) subset of XML:
43 * XML attributes are not supported
44 * DTDs are ignored (arbitrary keys can be queried and added)
45 * each tag is assumed to EITHER contain text OR more nested tags
46 * lists are treated atomically (no way to add/remove element from a collection)
48 The semantics is implemented as follows:
50 * If there are multiple tags with the same name, a list of XMLProxy nodes is returned
51 * If a unique tag does have no nested tags, its `text` string value is returned
52 * Otherwise, the node is returned
53 """
55 def _wrap(self, el: ET.Element) -> XMLProxy:
56 """Wrap a different element, inheriting the same namespace."""
57 return XMLProxy(el, default_namespace=self._def_ns)
59 def _dump(self):
60 """Dump XML to stdout (for debugging)."""
61 ET.dump(self._node)
63 def _qualified_key(self, key: str):
64 """If passed key is not qualified, prepends the default namespace (if set)."""
65 if key[0] == "{" or not self._def_ns:
66 return key
67 return "{" + self._def_ns + "}" + key
69 def _shortened_key(self, key: str):
70 """Inverse of `_qualified_key` (strips default namespace from element name)."""
71 if key[0] != "{" or not self._def_ns or key.find(self._def_ns) < 0:
72 return key
73 return key[key.find("}") + 1 :]
75 # ----
77 def __init__(self, el: ET.Element, *, default_namespace: Optional[str] = None):
78 """Wrap an existing XML ElementTree Element."""
79 self._node: ET.Element = el
80 self._def_ns = default_namespace
82 @classmethod
83 def parse(cls, path: Union[str, Path], **kwargs) -> XMLProxy:
84 """Parse an XML file into a wrapped ElementTree, preserving comments."""
85 path = path if isinstance(path, Path) else Path(path)
86 return cls(load_xml(path).getroot(), **kwargs)
88 def write(self, path: Union[str, Path], *, header: bool = True, **kwargs):
89 """Write the XML DOM to an UTF-8 encoded file."""
90 path = path if isinstance(path, Path) else Path(path)
91 et = ET.ElementTree(self._node)
92 if self._def_ns and "default_namespace" not in kwargs:
93 kwargs["default_namespace"] = self._def_ns
94 indent(et.getroot())
95 et.write(path, encoding="UTF-8", xml_declaration=header, **kwargs)
97 def __repr__(self):
98 """See `object.__repr__`."""
99 return str(self._node)
101 def __len__(self):
102 """Return number of inner tags inside current XML element.
104 Note that bool(node) thus checks whether an XML node is a leaf in the element tree.
105 """
106 return len(self._node)
108 def __iter__(self):
109 """Iterate the nested elements in-order."""
110 return map(self._wrap, iter(self._node))
112 @property
113 def namespace(self) -> Optional[str]:
114 """Default namespace of this node."""
115 return self._def_ns
117 @property
118 def is_comment(self):
119 """Return whether the current element node is an XML comment."""
120 return not isinstance(self._node.tag, str)
122 @property
123 def tag(self) -> Optional[str]:
124 """Return tag name of this element (unless it is a comment)."""
125 if self.is_comment:
126 return None
127 return self._shortened_key(self._node.tag)
129 @tag.setter
130 def tag(self, val: str):
131 """Set the tag of this element."""
132 if self.is_comment:
133 raise ValueError("Cannot set tag name for comment element!")
134 self._node.tag = self._qualified_key(val)
136 # ---- helpers ----
138 def to_jsonlike(
139 self,
140 *,
141 strip_default_ns: bool = True,
142 keep_root: bool = False,
143 ) -> JSONLike:
144 """Convert XML node to a JSON-like primitive, array or dict (ignoring attributes).
146 Note that all leaf values are strings (i.e. not parsed to bool/int/float etc.).
148 Args:
149 strip_default_ns: Do not qualify keys from the default namespace
150 keep_root: If true, the root tag name will be preserved (`{"root_tag": {...}}`)
152 """
153 if not len(self): # leaf -> assume it's a primitive value
154 return self._node.text or ""
156 dct = {}
157 ccnt = 0
158 for elem in iter(self):
159 raw = elem._node
160 if not isinstance(raw.tag, str):
161 ccnt += 1
162 key = f"__comment_{ccnt}__"
163 else:
164 key = raw.tag if not strip_default_ns else self._shortened_key(raw.tag)
166 curr_val = elem.to_jsonlike(strip_default_ns=strip_default_ns)
167 if key not in dct:
168 dct[key] = curr_val
169 continue
170 val = dct[key]
171 if not isinstance(val, list):
172 dct[key] = [dct[key]]
173 dct[key].append(curr_val)
175 return dct if not keep_root else {self._shortened_key(self._node.tag): dct}
177 @classmethod
178 def _from_jsonlike_primitive(
179 cls, val, *, elem_name: Optional[str] = None, **kwargs
180 ) -> Union[str, XMLProxy]:
181 """Convert a leaf node into a string value (i.e. return inner text).
183 Returns a string (or an XML element, if elem_name is passed).
184 """
185 if val is None:
186 ret = "" # turn None into empty string
187 elif isinstance(val, str):
188 ret = val
189 elif isinstance(val, bool):
190 ret = str(val).lower() # True -> true / False -> false
191 elif isinstance(val, (int, float)):
192 ret = str(val)
193 else:
194 raise TypeError(
195 f"Value of type {type(val)} is not JSON-like primitive: {val}"
196 )
198 if not elem_name:
199 return ret
200 else: # return the value wrapped as an element (needed in from_jsonlike)
201 elem = ET.Element(elem_name)
202 elem.text = ret
203 return cls(elem, **kwargs)
205 @classmethod
206 def from_jsonlike(
207 cls, val: JSONLike, *, root_name: Optional[str] = None, **kwargs: Any
208 ):
209 """Convert a JSON-like primitive, array or dict into an XML element.
211 Note that booleans are serialized as `true`/`false` and None as `null`.
213 Args:
214 val: Value to convert into an XML element.
215 root_name: If `val` is a dict, defines the tag name for the root element.
216 kwargs: Additional arguments for XML element instantiation.
218 """
219 if isinstance(val, list):
220 return list(
221 map(lambda x: cls.from_jsonlike(x, root_name=root_name, **kwargs), val)
222 )
223 if not isinstance(val, dict): # primitive val
224 return cls._from_jsonlike_primitive(val, elem_name=root_name, **kwargs)
226 # now the dict case remains
227 elem = ET.Element(root_name or "root")
228 for k, v in val.items():
229 if k.startswith(
230 "__comment_"
231 ): # special key names are mapped to XML comments
232 elem.append(ET.Comment(v if isinstance(v, str) else str(v)))
234 elif isinstance(v, list):
235 for vv in XMLProxy.from_jsonlike(v, root_name=k, **kwargs):
236 elem.append(vv._node)
237 elif not isinstance(v, dict): # primitive val
238 # FIXME: use better case-splitting for type of function to avoid cast
239 tmp = cast(
240 XMLProxy,
241 XMLProxy._from_jsonlike_primitive(v, elem_name=k, **kwargs),
242 )
243 elem.append(tmp._node)
244 else: # dict
245 elem.append(XMLProxy.from_jsonlike(v, root_name=k)._node)
247 return cls(elem, **kwargs)
249 # ---- dict-like access ----
251 def get(self, key: str, *, as_nodes: bool = False, deep: bool = False):
252 """Get sub-structure(s) of value(s) matching desired XML tag name.
254 * If there are multiple matching elements, will return them all as a list.
255 * If there is a single matching element, will return that element without a list.
257 Args:
258 key: tag name to retrieve
259 as_nodes: If true, will *always* return a list of (zero or more) XML nodes
260 deep: Expand nested XML elements instead of returning them as XML nodes
262 """
263 # NOTE: could allow to retrieve comments when using empty string/none as key?
265 if as_nodes and deep:
266 raise ValueError("as_nodes and deep are mutually exclusive!")
267 if not key:
268 raise ValueError("Key must not be an empty string!")
269 key = self._qualified_key(key)
271 # if not fully qualified + default NS is given, use it for query
272 lst = self._node.findall(key)
273 ns: List[XMLProxy] = list(map(self._wrap, lst))
274 if as_nodes: # return it as a list of xml nodes
275 return ns
276 if not ns: # no element
277 return None
279 ret = ns if not deep else [x.to_jsonlike() for x in ns]
280 if len(ret) == 1:
281 return ret[0] # single element
282 else:
283 return ret
285 def __getitem__(self, key: str):
286 """Acts like `dict.__getitem__`, implemented with `get`."""
287 val = self.get(key)
288 if val is not None:
289 return val
290 else:
291 raise KeyError(key)
293 def __contains__(self, key: str) -> bool:
294 """Acts like `dict.__contains__`, implemented with `get`."""
295 return self.get(key) is not None
297 def __delitem__(self, key: Union[str, XMLProxy]):
298 """Delete a nested XML element with matching key name.
300 Note that **all** XML elements with the given tag name are removed!
302 To prevent this behavior, instead of a string tag name you can provide the
303 exact element to be removed, i.e. if a node `node_a` represents the following XML:
305 ```
306 <a>
307 <b>1</b>
308 <c>2</c>
309 <b>3</b>
310 </a>
311 ```
313 Then we have that:
315 * `del node_a["b"]` removes **both** tags, leaving just the `c` tag.
316 * `del node_a[node_a["a"][1]]` removes just the second tag with the `3`.
317 """
318 if isinstance(key, str):
319 nodes = self.get(key, as_nodes=True)
320 else:
321 nodes = [key] if key._node in self._node else []
323 if not nodes:
324 raise KeyError(key)
326 if self._node.text is not None:
327 self._node.text = ""
328 for child in nodes:
329 self._node.remove(child._node)
331 def _clear(self):
332 """Remove contents of this XML element (e.g. for overwriting in-place)."""
333 self._node.text = ""
334 children = list(iter(self._node)) # need to store, removal invalidates iterator
335 for child in children:
336 self._node.remove(child)
338 def __setitem__(self, key: Union[str, XMLProxy], val: Union[JSONLike, XMLProxy]):
339 """Add or overwrite an inner XML tag.
341 If there is exactly one matching tag, the value is substituted in-place.
342 If the passed value is a list, all list entries are added in their own element.
344 If there are multiple existing matches or target values, then
345 **all** existing elements are removed and the new value(s) are added in
346 new element(s) (i.e. coming after other unrelated existing elements)!
348 To prevent this behavior, instead of a string tag name you can provide the
349 exact element to be overwritten, i.e. if a node `node_a` represents the following XML:
351 ```
352 <a>
353 <b>1</b>
354 <c>2</c>
355 <b>3</b>
356 </a>
357 ```
359 Then we have that:
361 * `node_a["b"] = 5` removes both existing tags and creates a new tag with the passed value(s).
362 * `node_a[node_a["b"][1]] = 5` replaces the `3` in the second tag with the `5`.
364 Note that the passed value must be either an XML element already, or be a pure JSON-like object.
365 """
366 if isinstance(key, str):
367 nodes = self.get(key, as_nodes=True)
368 # delete all existing elements if multiple exist or are passed
369 if len(nodes) > 1 or (len(nodes) and isinstance(val, list)):
370 del self[key]
371 nodes = []
372 # now we can assume there's zero or one suitable target elements
373 if nodes: # if it is one, clear it out
374 nodes[0]._clear()
375 else: # an XMLProxy object was passed as key -> try to use that
376 if isinstance(val, list):
377 raise ValueError(
378 "Cannot overwrite a single element with a list of values!"
379 )
380 # ensure the target node is cleared out and use it as target
381 key._clear()
382 nodes = [key]
383 key = key.tag
385 # ensure key string is qualified with a namespace
386 key_name: str = self._qualified_key(key)
388 # normalize passed value(s) to be list (general case)
389 vals = val if isinstance(val, list) else [val]
391 # ensure there is the required number of target element nodes
392 for _ in range(len(vals) - len(nodes)):
393 nodes.append(self._wrap(ET.SubElement(self._node, key_name)))
395 # normalize values no XML element nodes
396 nvals = []
397 for val in vals:
398 # ensure value is represented as an XML node
399 if isinstance(val, XMLProxy):
400 obj = self._wrap(ET.Element("dummy"))
401 obj._node.append(val._node)
402 else:
403 obj = self.from_jsonlike(val, root_name=key_name)
405 nvals.append(obj)
407 for node, val in zip(nodes, nvals):
408 # transplant node contents into existing element (so it is inserted in-place)
409 node._node.text = val._node.text
410 for child in iter(val):
411 node._node.append(child._node)