Coverage for src/somesy/pom_xml/xmlproxy.py: 94%

190 statements  

« prev     ^ index     » next       coverage.py v7.6.0, created at 2024-07-29 07:50 +0000

1"""Wrapper to provide dict-like access to XML via ElementTree.""" 

2 

3from __future__ import annotations 

4 

5import xml.etree.ElementTree as ET 

6from pathlib import Path 

7from typing import Any, List, Optional, Union, cast 

8 

9import defusedxml.ElementTree as DET 

10 

# Shallow alias used purely as documentation in signatures: it marks values
# that are expected to be JSON-like (a primitive, a list, or a dict), but it
# is just `Any`, so nothing is enforced by a type checker.
JSONLike = Any

13 

14 

def load_xml(path: Path) -> ET.ElementTree:
    """Read an XML file from disk and return it as an ElementTree.

    Comments in the document are kept as nodes of the resulting tree, because
    the tree is built by a `TreeBuilder` created with `insert_comments=True`.
    Parsing goes through the `defusedxml` parser and parse function.

    Args:
        path: Location of the XML file. Values that are not `Path` instances
            are converted with `Path(...)` first.

    """
    if not isinstance(path, Path):
        path = Path(path)

    comment_preserving_builder = ET.TreeBuilder(insert_comments=True)
    xml_parser = DET.XMLParser(target=comment_preserving_builder)
    return DET.parse(path, parser=xml_parser)

20 

21 

def indent(elem, level=0, *, space=" "):
    """Indent the elements of this XML node in-place (i.e. pretty print).

    Whitespace-only (or missing) `text` and `tail` values are replaced by a
    newline followed by the indentation for the respective nesting depth.
    Text and tails that contain non-whitespace characters are left untouched.

    Args:
        elem: Element whose subtree is re-indented (modified in-place).
        level: Nesting depth of `elem`; the top-level call uses 0.
        space: Whitespace used for one indentation level.

    """
    # NOTE(review): the default unit mirrors the literal visible in the
    # reviewed source, which may have been whitespace-collapsed by the tool
    # that rendered it -- confirm the intended width against the real file.
    pad = "\n" + level * space

    def _is_blank(text):
        # None, empty and whitespace-only strings are all free to overwrite
        return not text or not text.strip()

    children = list(elem)
    if not children:
        # leaf: only its tail matters, and the root (level 0) keeps its tail
        if level and _is_blank(elem.tail):
            elem.tail = pad
        return

    if _is_blank(elem.text):
        elem.text = pad + space  # the first child starts one level deeper
    if _is_blank(elem.tail):
        elem.tail = pad

    for child in children:
        indent(child, level + 1, space=space)

    # The recursion gave every child a tail at child depth; the tail of the
    # last child precedes the closing tag of `elem`, so it is dedented again.
    last_child = children[-1]
    if _is_blank(last_child.tail):
        last_child.tail = pad

37 

38 

class XMLProxy:
    """Class providing dict-like access to edit XML via ElementTree.

    Note that this wrapper facade is limited to a restricted (but useful) subset of XML:

    * XML attributes are not supported
    * DTDs are ignored (arbitrary keys can be queried and added)
    * each tag is assumed to EITHER contain text OR more nested tags
    * lists are treated atomically (no way to add/remove element from a collection)

    Lookups (`get`, `[...]`) behave as follows:

    * If there are multiple tags with the same name, a list of XMLProxy nodes is returned
    * If there is exactly one matching tag, that single node is returned
    * With `get(..., deep=True)` the matches are expanded with `to_jsonlike`,
      so a tag without nested tags yields its `text` string value
    """

    def _wrap(self, el: ET.Element) -> XMLProxy:
        """Wrap a different element, inheriting the same namespace."""
        return XMLProxy(el, default_namespace=self._def_ns)

    def _dump(self):
        """Dump XML to stdout (for debugging)."""
        ET.dump(self._node)

    def _qualified_key(self, key: str):
        """If passed key is not qualified, prepends the default namespace (if set)."""
        if key[0] == "{" or not self._def_ns:
            return key
        return "{" + self._def_ns + "}" + key

    def _shortened_key(self, key: str):
        """Inverse of `_qualified_key` (strips default namespace from element name).

        Only a key that is qualified with *exactly* the default namespace is
        shortened; keys from any other namespace are returned unchanged.
        """
        if not self._def_ns:
            return key
        # Compare against the full `{namespace}` prefix. A plain substring
        # search would also match other namespaces (or local names) that
        # merely contain the default namespace and wrongly strip those.
        prefix = "{" + self._def_ns + "}"
        if key.startswith(prefix):
            return key[len(prefix) :]
        return key

    # ----

    def __init__(self, el: ET.Element, *, default_namespace: Optional[str] = None):
        """Wrap an existing XML ElementTree Element.

        Args:
            el: Element to wrap.
            default_namespace: Namespace assumed for unqualified tag names.

        """
        self._node: ET.Element = el
        self._def_ns = default_namespace

    @classmethod
    def parse(cls, path: Union[str, Path], **kwargs) -> XMLProxy:
        """Parse an XML file into a wrapped ElementTree, preserving comments.

        Args:
            path: File to read.
            kwargs: Passed on to the constructor (e.g. `default_namespace`).

        """
        path = path if isinstance(path, Path) else Path(path)
        return cls(load_xml(path).getroot(), **kwargs)

    def write(self, path: Union[str, Path], *, header: bool = True, **kwargs):
        """Write the XML DOM to an UTF-8 encoded file.

        Note that the wrapped tree is re-indented in-place before writing.

        Args:
            path: Target file.
            header: Whether to emit the XML declaration.
            kwargs: Passed on to `ElementTree.write`. Unless overridden here,
                the default namespace of this node is used as `default_namespace`.

        """
        path = path if isinstance(path, Path) else Path(path)
        tree = ET.ElementTree(self._node)
        if self._def_ns and "default_namespace" not in kwargs:
            kwargs["default_namespace"] = self._def_ns
        indent(tree.getroot())
        tree.write(path, encoding="UTF-8", xml_declaration=header, **kwargs)

    def __repr__(self):
        """See `object.__repr__`."""
        return str(self._node)

    def __len__(self):
        """Return number of inner tags inside current XML element.

        Note that `bool(node)` is thus False exactly for leaves of the element
        tree (i.e. `not node` checks whether an XML node is a leaf).
        """
        return len(self._node)

    def __iter__(self):
        """Iterate the nested elements in-order (each wrapped as XMLProxy)."""
        return map(self._wrap, iter(self._node))

    @property
    def namespace(self) -> Optional[str]:
        """Default namespace of this node."""
        return self._def_ns

    @property
    def is_comment(self):
        """Return whether the current element node is an XML comment."""
        # comment nodes created by ElementTree carry a non-string tag
        return not isinstance(self._node.tag, str)

    @property
    def tag(self) -> Optional[str]:
        """Return tag name of this element (unless it is a comment)."""
        if self.is_comment:
            return None
        return self._shortened_key(self._node.tag)

    @tag.setter
    def tag(self, val: str):
        """Set the tag of this element."""
        if self.is_comment:
            raise ValueError("Cannot set tag name for comment element!")
        self._node.tag = self._qualified_key(val)

    # ---- helpers ----

    def to_jsonlike(
        self,
        *,
        strip_default_ns: bool = True,
        keep_root: bool = False,
    ) -> JSONLike:
        """Convert XML node to a JSON-like primitive, array or dict (ignoring attributes).

        Note that all leaf values are strings (i.e. not parsed to bool/int/float etc.).
        Comments are stored under generated keys `__comment_1__`, `__comment_2__`, ...
        (counted per parent element).

        Args:
            strip_default_ns: Do not qualify keys from the default namespace
            keep_root: If true, the root tag name will be preserved (`{"root_tag": {...}}`).
                This has no effect if the node is a leaf (its text is returned as-is).

        """
        if len(self) == 0:  # leaf -> assume it's a primitive value
            return self._node.text or ""

        dct = {}
        comment_count = 0
        for child in self:
            raw = child._node
            if isinstance(raw.tag, str):
                key = self._shortened_key(raw.tag) if strip_default_ns else raw.tag
            else:  # comment node -> synthesize a unique key
                comment_count += 1
                key = f"__comment_{comment_count}__"

            value = child.to_jsonlike(strip_default_ns=strip_default_ns)
            if key not in dct:
                dct[key] = value
            elif isinstance(dct[key], list):
                dct[key].append(value)
            else:  # second occurrence of the key -> promote to a list
                dct[key] = [dct[key], value]

        if keep_root:
            return {self._shortened_key(self._node.tag): dct}
        return dct

    @classmethod
    def _from_jsonlike_primitive(
        cls, val, *, elem_name: Optional[str] = None, **kwargs
    ) -> Union[str, XMLProxy]:
        """Convert a leaf node into a string value (i.e. return inner text).

        None becomes the empty string, booleans become `true`/`false` and
        numbers are converted with `str`.

        Returns a string (or an XML element, if elem_name is passed).

        Raises:
            TypeError: If the value is not None, str, bool, int or float.

        """
        if val is None:
            text = ""  # turn None into empty string
        elif isinstance(val, str):
            text = val
        elif isinstance(val, bool):  # must be checked before int (bool is an int)
            text = str(val).lower()  # True -> true / False -> false
        elif isinstance(val, (int, float)):
            text = str(val)
        else:
            raise TypeError(
                f"Value of type {type(val)} is not JSON-like primitive: {val}"
            )

        if not elem_name:
            return text

        # return the value wrapped as an element (needed in from_jsonlike)
        elem = ET.Element(elem_name)
        elem.text = text
        return cls(elem, **kwargs)

    @classmethod
    def from_jsonlike(
        cls, val: JSONLike, *, root_name: Optional[str] = None, **kwargs: Any
    ):
        """Convert a JSON-like primitive, array or dict into an XML element.

        Note that booleans are serialized as `true`/`false` and None as an empty string.

        A list is converted element-wise, i.e. a list of results is returned.
        Dict keys starting with `__comment_` are turned into XML comments.

        Args:
            val: Value to convert into an XML element.
            root_name: If `val` is a dict, defines the tag name for the root element
                (`root` is used if no name is given).
            kwargs: Additional arguments for XML element instantiation.

        """
        if isinstance(val, list):
            return [cls.from_jsonlike(x, root_name=root_name, **kwargs) for x in val]
        if not isinstance(val, dict):  # primitive val
            return cls._from_jsonlike_primitive(val, elem_name=root_name, **kwargs)

        # now the dict case remains
        elem = ET.Element(root_name or "root")
        for k, v in val.items():
            if k.startswith("__comment_"):
                # special key names are mapped to XML comments
                elem.append(ET.Comment(v if isinstance(v, str) else str(v)))
            elif isinstance(v, list):
                # each list entry gets its own element named after the key
                for entry in XMLProxy.from_jsonlike(v, root_name=k, **kwargs):
                    elem.append(entry._node)
            elif isinstance(v, dict):
                nested = XMLProxy.from_jsonlike(v, root_name=k, **kwargs)
                elem.append(nested._node)
            else:  # primitive val
                # FIXME: use better case-splitting for type of function to avoid cast
                leaf = cast(
                    XMLProxy,
                    XMLProxy._from_jsonlike_primitive(v, elem_name=k, **kwargs),
                )
                elem.append(leaf._node)

        return cls(elem, **kwargs)

    # ---- dict-like access ----

    def get(self, key: str, *, as_nodes: bool = False, deep: bool = False):
        """Get sub-structure(s) of value(s) matching desired XML tag name.

        * If there are multiple matching elements, will return them all as a list.
        * If there is a single matching element, will return that element without a list.
        * If there is no matching element, will return None (unless `as_nodes` is set).

        Args:
            key: tag name to retrieve
            as_nodes: If true, will *always* return a list of (zero or more) XML nodes
            deep: Expand nested XML elements instead of returning them as XML nodes

        Raises:
            ValueError: If the key is empty or both `as_nodes` and `deep` are set.

        """
        # NOTE: could allow to retrieve comments when using empty string/none as key?

        if as_nodes and deep:
            raise ValueError("as_nodes and deep are mutually exclusive!")
        if not key:
            raise ValueError("Key must not be an empty string!")

        # if not fully qualified + default NS is given, use it for query
        found = self._node.findall(self._qualified_key(key))
        matches: List[XMLProxy] = [self._wrap(el) for el in found]
        if as_nodes:  # return it as a list of xml nodes
            return matches
        if not matches:  # no element
            return None

        result = [m.to_jsonlike() for m in matches] if deep else matches
        return result[0] if len(result) == 1 else result

    def __getitem__(self, key: str):
        """Acts like `dict.__getitem__`, implemented with `get`."""
        val = self.get(key)
        if val is None:
            raise KeyError(key)
        return val

    def __contains__(self, key: str) -> bool:
        """Acts like `dict.__contains__`, implemented with `get`."""
        return self.get(key) is not None

    def __delitem__(self, key: Union[str, XMLProxy]):
        """Delete a nested XML element with matching key name.

        Note that **all** XML elements with the given tag name are removed!

        To prevent this behavior, instead of a string tag name you can provide the
        exact element to be removed, i.e. if a node `node_a` represents the following XML:

        ```
        <a>
          <b>1</b>
          <c>2</c>
          <b>3</b>
        </a>
        ```

        Then we have that:

        * `del node_a["b"]` removes **both** tags, leaving just the `c` tag.
        * `del node_a[node_a["b"][1]]` removes just the second tag with the `3`.

        Raises:
            KeyError: If no matching direct child element exists.

        """
        if isinstance(key, str):
            targets = self.get(key, as_nodes=True)
        else:  # exact node passed -> only accept it if it is a direct child
            targets = [key] if key._node in self._node else []

        if not targets:
            raise KeyError(key)

        # NOTE(review): existing text of this element is reset on deletion,
        # presumably to drop stale indentation whitespace -- confirm intent.
        if self._node.text is not None:
            self._node.text = ""
        for child in targets:
            self._node.remove(child._node)

    def _clear(self):
        """Remove contents of this XML element (e.g. for overwriting in-place)."""
        self._node.text = ""
        # need to store the children first, removal invalidates the iterator
        for child in list(self._node):
            self._node.remove(child)

    def __setitem__(self, key: Union[str, XMLProxy], val: Union[JSONLike, XMLProxy]):
        """Add or overwrite an inner XML tag.

        If there is exactly one matching tag, the value is substituted in-place.
        If the passed value is a list, all list entries are added in their own element.

        If there are multiple existing matches or target values, then
        **all** existing elements are removed and the new value(s) are added in
        new element(s) (i.e. coming after other unrelated existing elements)!

        To prevent this behavior, instead of a string tag name you can provide the
        exact element to be overwritten, i.e. if a node `node_a` represents the following XML:

        ```
        <a>
          <b>1</b>
          <c>2</c>
          <b>3</b>
        </a>
        ```

        Then we have that:

        * `node_a["b"] = 5` removes both existing tags and creates a new tag with the passed value(s).
        * `node_a[node_a["b"][1]] = 5` replaces the `3` in the second tag with the `5`.

        Note that the passed value must be either an XML element already, or be a pure JSON-like object.

        Raises:
            ValueError: If an element is passed as key together with a list value.

        """
        if isinstance(key, str):
            targets = self.get(key, as_nodes=True)
            # delete all existing elements if multiple exist or are passed
            if len(targets) > 1 or (len(targets) and isinstance(val, list)):
                del self[key]
                targets = []
            # now we can assume there's zero or one suitable target elements
            if targets:  # if it is one, clear it out
                targets[0]._clear()
        else:  # an XMLProxy object was passed as key -> try to use that
            if isinstance(val, list):
                raise ValueError(
                    "Cannot overwrite a single element with a list of values!"
                )
            # ensure the target node is cleared out and use it as target
            key._clear()
            targets = [key]
            key = key.tag

        # ensure key string is qualified with a namespace
        key_name: str = self._qualified_key(key)

        # normalize passed value(s) to be list (general case)
        vals = val if isinstance(val, list) else [val]

        # ensure there is the required number of target element nodes
        for _ in range(len(vals) - len(targets)):
            targets.append(self._wrap(ET.SubElement(self._node, key_name)))

        # normalize values to XML element nodes
        sources = []
        for item in vals:
            if isinstance(item, XMLProxy):
                # put the node into a throw-away parent, so that the node
                # itself is what gets transplanted as content below
                holder = self._wrap(ET.Element("dummy"))
                holder._node.append(item._node)
            else:
                holder = self.from_jsonlike(item, root_name=key_name)
            sources.append(holder)

        for target, source in zip(targets, sources):
            # transplant node contents into existing element (so it is inserted in-place)
            target._node.text = source._node.text
            for child in source:
                target._node.append(child._node)