Coverage for src/somesy/pom_xml/xmlproxy.py: 94%

190 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2024-04-30 09:42 +0000

1"""Wrapper to provide dict-like access to XML via ElementTree.""" 

2from __future__ import annotations 

3 

4import xml.etree.ElementTree as ET 

5from pathlib import Path 

6from typing import Any, List, Optional, Union, cast 

7 

8import defusedxml.ElementTree as DET 

9 

10# shallow type hint mostly for documentation purpose 

11JSONLike = Any 

12 

13 

14def load_xml(path: Path) -> ET.ElementTree: 

15 """Parse an XML file into an ElementTree, preserving comments.""" 

16 path = path if isinstance(path, Path) else Path(path) 

17 parser = DET.XMLParser(target=ET.TreeBuilder(insert_comments=True)) 

18 return DET.parse(path, parser=parser) 

19 

20 

21def indent(elem, level=0): 

22 """Indent the elements of this XML node (i.e. pretty print).""" 

23 i = "\n" + level * " " 

24 if len(elem): 

25 if not elem.text or not elem.text.strip(): 

26 elem.text = i + " " 

27 if not elem.tail or not elem.tail.strip(): 

28 elem.tail = i 

29 for el in elem: 

30 indent(el, level + 1) 

31 if not el.tail or not el.tail.strip(): 

32 el.tail = i 

33 else: 

34 if level and (not elem.tail or not elem.tail.strip()): 

35 elem.tail = i 

36 

37 

38class XMLProxy: 

39 """Class providing dict-like access to edit XML via ElementTree. 

40 

41 Note that this wrapper facade is limited to a restricted (but useful) subset of XML: 

42 * XML attributes are not supported 

43 * DTDs are ignored (arbitrary keys can be queried and added) 

44 * each tag is assumed to EITHER contain text OR more nested tags 

45 * lists are treated atomically (no way to add/remove element from a collection) 

46 

47 The semantics is implemented as follows: 

48 

49 * If there are multiple tags with the same name, a list of XMLProxy nodes is returned 

50 * If a unique tag does have no nested tags, its `text` string value is returned 

51 * Otherwise, the node is returned 

52 """ 

53 

54 def _wrap(self, el: ET.Element) -> XMLProxy: 

55 """Wrap a different element, inheriting the same namespace.""" 

56 return XMLProxy(el, default_namespace=self._def_ns) 

57 

58 def _dump(self): 

59 """Dump XML to stdout (for debugging).""" 

60 ET.dump(self._node) 

61 

62 def _qualified_key(self, key: str): 

63 """If passed key is not qualified, prepends the default namespace (if set).""" 

64 if key[0] == "{" or not self._def_ns: 

65 return key 

66 return "{" + self._def_ns + "}" + key 

67 

68 def _shortened_key(self, key: str): 

69 """Inverse of `_qualified_key` (strips default namespace from element name).""" 

70 if key[0] != "{" or not self._def_ns or key.find(self._def_ns) < 0: 

71 return key 

72 return key[key.find("}") + 1 :] 

73 

74 # ---- 

75 

76 def __init__(self, el: ET.Element, *, default_namespace: Optional[str] = None): 

77 """Wrap an existing XML ElementTree Element.""" 

78 self._node: ET.Element = el 

79 self._def_ns = default_namespace 

80 

81 @classmethod 

82 def parse(cls, path: Union[str, Path], **kwargs) -> XMLProxy: 

83 """Parse an XML file into a wrapped ElementTree, preserving comments.""" 

84 path = path if isinstance(path, Path) else Path(path) 

85 return cls(load_xml(path).getroot(), **kwargs) 

86 

87 def write(self, path: Union[str, Path], *, header: bool = True, **kwargs): 

88 """Write the XML DOM to an UTF-8 encoded file.""" 

89 path = path if isinstance(path, Path) else Path(path) 

90 et = ET.ElementTree(self._node) 

91 if self._def_ns and "default_namespace" not in kwargs: 

92 kwargs["default_namespace"] = self._def_ns 

93 indent(et.getroot()) 

94 et.write(path, encoding="UTF-8", xml_declaration=header, **kwargs) 

95 

96 def __repr__(self): 

97 """See `object.__repr__`.""" 

98 return str(self._node) 

99 

100 def __len__(self): 

101 """Return number of inner tags inside current XML element. 

102 

103 Note that bool(node) thus checks whether an XML node is a leaf in the element tree. 

104 """ 

105 return len(self._node) 

106 

107 def __iter__(self): 

108 """Iterate the nested elements in-order.""" 

109 return map(self._wrap, iter(self._node)) 

110 

111 @property 

112 def namespace(self) -> Optional[str]: 

113 """Default namespace of this node.""" 

114 return self._def_ns 

115 

116 @property 

117 def is_comment(self): 

118 """Return whether the current element node is an XML comment.""" 

119 return not isinstance(self._node.tag, str) 

120 

121 @property 

122 def tag(self) -> Optional[str]: 

123 """Return tag name of this element (unless it is a comment).""" 

124 if self.is_comment: 

125 return None 

126 return self._shortened_key(self._node.tag) 

127 

128 @tag.setter 

129 def tag(self, val: str): 

130 """Set the tag of this element.""" 

131 if self.is_comment: 

132 raise ValueError("Cannot set tag name for comment element!") 

133 self._node.tag = self._qualified_key(val) 

134 

135 # ---- helpers ---- 

136 

137 def to_jsonlike( 

138 self, 

139 *, 

140 strip_default_ns: bool = True, 

141 keep_root: bool = False, 

142 ) -> JSONLike: 

143 """Convert XML node to a JSON-like primitive, array or dict (ignoring attributes). 

144 

145 Note that all leaf values are strings (i.e. not parsed to bool/int/float etc.). 

146 

147 Args: 

148 strip_default_ns: Do not qualify keys from the default namespace 

149 keep_root: If true, the root tag name will be preserved (`{"root_tag": {...}}`) 

150 """ 

151 if not len(self): # leaf -> assume it's a primitive value 

152 return self._node.text or "" 

153 

154 dct = {} 

155 ccnt = 0 

156 for elem in iter(self): 

157 raw = elem._node 

158 if not isinstance(raw.tag, str): 

159 ccnt += 1 

160 key = f"__comment_{ccnt}__" 

161 else: 

162 key = raw.tag if not strip_default_ns else self._shortened_key(raw.tag) 

163 

164 curr_val = elem.to_jsonlike(strip_default_ns=strip_default_ns) 

165 if key not in dct: 

166 dct[key] = curr_val 

167 continue 

168 val = dct[key] 

169 if not isinstance(val, list): 

170 dct[key] = [dct[key]] 

171 dct[key].append(curr_val) 

172 

173 return dct if not keep_root else {self._shortened_key(self._node.tag): dct} 

174 

175 @classmethod 

176 def _from_jsonlike_primitive( 

177 cls, val, *, elem_name: Optional[str] = None, **kwargs 

178 ) -> Union[str, XMLProxy]: 

179 """Convert a leaf node into a string value (i.e. return inner text). 

180 

181 Returns a string (or an XML element, if elem_name is passed). 

182 """ 

183 if val is None: 

184 ret = "" # turn None into empty string 

185 elif isinstance(val, str): 

186 ret = val 

187 elif isinstance(val, bool): 

188 ret = str(val).lower() # True -> true / False -> false 

189 elif isinstance(val, (int, float)): 

190 ret = str(val) 

191 else: 

192 raise TypeError( 

193 f"Value of type {type(val)} is not JSON-like primitive: {val}" 

194 ) 

195 

196 if not elem_name: 

197 return ret 

198 else: # return the value wrapped as an element (needed in from_jsonlike) 

199 elem = ET.Element(elem_name) 

200 elem.text = ret 

201 return cls(elem, **kwargs) 

202 

203 @classmethod 

204 def from_jsonlike( 

205 cls, val: JSONLike, *, root_name: Optional[str] = None, **kwargs: Any 

206 ): 

207 """Convert a JSON-like primitive, array or dict into an XML element. 

208 

209 Note that booleans are serialized as `true`/`false` and None as `null`. 

210 

211 Args: 

212 val: Value to convert into an XML element. 

213 root_name: If `val` is a dict, defines the tag name for the root element. 

214 kwargs: Additional arguments for XML element instantiation. 

215 """ 

216 if isinstance(val, list): 

217 return list( 

218 map(lambda x: cls.from_jsonlike(x, root_name=root_name, **kwargs), val) 

219 ) 

220 if not isinstance(val, dict): # primitive val 

221 return cls._from_jsonlike_primitive(val, elem_name=root_name, **kwargs) 

222 

223 # now the dict case remains 

224 elem = ET.Element(root_name or "root") 

225 for k, v in val.items(): 

226 if k.startswith( 

227 "__comment_" 

228 ): # special key names are mapped to XML comments 

229 elem.append(ET.Comment(v if isinstance(v, str) else str(v))) 

230 

231 elif isinstance(v, list): 

232 for vv in XMLProxy.from_jsonlike(v, root_name=k, **kwargs): 

233 elem.append(vv._node) 

234 elif not isinstance(v, dict): # primitive val 

235 # FIXME: use better case-splitting for type of function to avoid cast 

236 tmp = cast( 

237 XMLProxy, 

238 XMLProxy._from_jsonlike_primitive(v, elem_name=k, **kwargs), 

239 ) 

240 elem.append(tmp._node) 

241 else: # dict 

242 elem.append(XMLProxy.from_jsonlike(v, root_name=k)._node) 

243 

244 return cls(elem, **kwargs) 

245 

246 # ---- dict-like access ---- 

247 

248 def get(self, key: str, *, as_nodes: bool = False, deep: bool = False): 

249 """Get sub-structure(s) of value(s) matching desired XML tag name. 

250 

251 * If there are multiple matching elements, will return them all as a list. 

252 * If there is a single matching element, will return that element without a list. 

253 

254 Args: 

255 key: tag name to retrieve 

256 as_nodes: If true, will *always* return a list of (zero or more) XML nodes 

257 deep: Expand nested XML elements instead of returning them as XML nodes 

258 """ 

259 # NOTE: could allow to retrieve comments when using empty string/none as key? 

260 

261 if as_nodes and deep: 

262 raise ValueError("as_nodes and deep are mutually exclusive!") 

263 if not key: 

264 raise ValueError("Key must not be an empty string!") 

265 key = self._qualified_key(key) 

266 

267 # if not fully qualified + default NS is given, use it for query 

268 lst = self._node.findall(key) 

269 ns: List[XMLProxy] = list(map(self._wrap, lst)) 

270 if as_nodes: # return it as a list of xml nodes 

271 return ns 

272 if not ns: # no element 

273 return None 

274 

275 ret = ns if not deep else [x.to_jsonlike() for x in ns] 

276 if len(ret) == 1: 

277 return ret[0] # single element 

278 else: 

279 return ret 

280 

281 def __getitem__(self, key: str): 

282 """Acts like `dict.__getitem__`, implemented with `get`.""" 

283 val = self.get(key) 

284 if val is not None: 

285 return val 

286 else: 

287 raise KeyError(key) 

288 

289 def __contains__(self, key: str) -> bool: 

290 """Acts like `dict.__contains__`, implemented with `get`.""" 

291 return self.get(key) is not None 

292 

293 def __delitem__(self, key: Union[str, XMLProxy]): 

294 """Delete a nested XML element with matching key name. 

295 

296 Note that **all** XML elements with the given tag name are removed! 

297 

298 To prevent this behavior, instead of a string tag name you can provide the 

299 exact element to be removed, i.e. if a node `node_a` represents the following XML: 

300 

301 ``` 

302 <a> 

303 <b>1</b> 

304 <c>2</c> 

305 <b>3</b> 

306 </a> 

307 ``` 

308 

309 Then we have that: 

310 

311 * `del node_a["b"]` removes **both** tags, leaving just the `c` tag. 

312 * `del node_a[node_a["a"][1]]` removes just the second tag with the `3`. 

313 """ 

314 if isinstance(key, str): 

315 nodes = self.get(key, as_nodes=True) 

316 else: 

317 nodes = [key] if key._node in self._node else [] 

318 

319 if not nodes: 

320 raise KeyError(key) 

321 

322 if self._node.text is not None: 

323 self._node.text = "" 

324 for child in nodes: 

325 self._node.remove(child._node) 

326 

327 def _clear(self): 

328 """Remove contents of this XML element (e.g. for overwriting in-place).""" 

329 self._node.text = "" 

330 children = list(iter(self._node)) # need to store, removal invalidates iterator 

331 for child in children: 

332 self._node.remove(child) 

333 

334 def __setitem__(self, key: Union[str, XMLProxy], val: Union[JSONLike, XMLProxy]): 

335 """Add or overwrite an inner XML tag. 

336 

337 If there is exactly one matching tag, the value is substituted in-place. 

338 If the passed value is a list, all list entries are added in their own element. 

339 

340 If there are multiple existing matches or target values, then 

341 **all** existing elements are removed and the new value(s) are added in 

342 new element(s) (i.e. coming after other unrelated existing elements)! 

343 

344 To prevent this behavior, instead of a string tag name you can provide the 

345 exact element to be overwritten, i.e. if a node `node_a` represents the following XML: 

346 

347 ``` 

348 <a> 

349 <b>1</b> 

350 <c>2</c> 

351 <b>3</b> 

352 </a> 

353 ``` 

354 

355 Then we have that: 

356 

357 * `node_a["b"] = 5` removes both existing tags and creates a new tag with the passed value(s). 

358 * `node_a[node_a["b"][1]] = 5` replaces the `3` in the second tag with the `5`. 

359 

360 Note that the passed value must be either an XML element already, or be a pure JSON-like object. 

361 """ 

362 if isinstance(key, str): 

363 nodes = self.get(key, as_nodes=True) 

364 # delete all existing elements if multiple exist or are passed 

365 if len(nodes) > 1 or (len(nodes) and isinstance(val, list)): 

366 del self[key] 

367 nodes = [] 

368 # now we can assume there's zero or one suitable target elements 

369 if nodes: # if it is one, clear it out 

370 nodes[0]._clear() 

371 else: # an XMLProxy object was passed as key -> try to use that 

372 if isinstance(val, list): 

373 raise ValueError( 

374 "Cannot overwrite a single element with a list of values!" 

375 ) 

376 # ensure the target node is cleared out and use it as target 

377 key._clear() 

378 nodes = [key] 

379 key = key.tag 

380 

381 # ensure key string is qualified with a namespace 

382 key_name: str = self._qualified_key(key) 

383 

384 # normalize passed value(s) to be list (general case) 

385 vals = val if isinstance(val, list) else [val] 

386 

387 # ensure there is the required number of target element nodes 

388 for _ in range(len(vals) - len(nodes)): 

389 nodes.append(self._wrap(ET.SubElement(self._node, key_name))) 

390 

391 # normalize values no XML element nodes 

392 nvals = [] 

393 for val in vals: 

394 # ensure value is represented as an XML node 

395 if isinstance(val, XMLProxy): 

396 obj = self._wrap(ET.Element("dummy")) 

397 obj._node.append(val._node) 

398 else: 

399 obj = self.from_jsonlike(val, root_name=key_name) 

400 

401 nvals.append(obj) 

402 

403 for node, val in zip(nodes, nvals): 

404 # transplant node contents into existing element (so it is inserted in-place) 

405 node._node.text = val._node.text 

406 for child in iter(val): 

407 node._node.append(child._node)