Coverage for gws-app/gws/lib/xmlx/parser.py: 93%
106 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-16 23:09 +0200
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-16 23:09 +0200
1"""XML parser."""
3from typing import Optional, cast
5import re
6import xml.etree.ElementTree
8import gws
10from . import error, element, namespace
def from_path(path: str, opts: Optional[gws.XmlOptions] = None) -> gws.XmlElement:
    """Read an XML file from disk and parse it into an ``XmlElement``.

    The file is read in binary mode, so the encoding is resolved by the
    parser rather than by ``open``.

    Args:
        path: Path to the .xml file.
        opts: XML options.

    Returns:
        The root element of the parsed document.
    """

    with open(path, 'rb') as source:
        raw = source.read()

    root = _parse(raw, opts)
    return root
def from_string(inp: str | bytes, opts: Optional[gws.XmlOptions] = None) -> gws.XmlElement:
    """Parse an in-memory XML document into an ``XmlElement``.

    Args:
        inp: The XML document, either as text or as raw bytes.
        opts: XML options.

    Returns:
        The root element of the parsed document.
    """

    root = _parse(inp, opts)
    return root
37##
def _parse(inp, opts: Optional[gws.XmlOptions] = None) -> gws.XmlElement:
    """Decode the input, run it through the XML parser and return the root element.

    Args:
        inp: The document as ``str`` or ``bytes``.
        opts: XML options; a default ``gws.XmlOptions()`` is used when omitted.

    Returns:
        Whatever the parser target's ``close`` returns, i.e. the root element.

    Raises:
        error.ParseError: If the underlying parser reports a parse error.
    """

    # Always hand the parser a ``str`` so that it does not decode by itself.
    text = _decode_input(inp)

    target = _ParserTarget(opts or gws.XmlOptions())
    xml_parser = xml.etree.ElementTree.XMLParser(target=target)

    try:
        xml_parser.feed(text)
        root = xml_parser.close()
    except xml.etree.ElementTree.ParseError as exc:
        # Re-raise as the package's own error type, keeping the original message.
        raise error.ParseError(exc.args[0]) from exc

    return cast(gws.XmlElement, root)
class _ParserTarget:
    """Parser target that assembles a tree of ``XmlElement`` objects.

    An instance is passed as ``target`` to ``xml.etree.ElementTree.XMLParser``,
    which invokes ``start``, ``end``, ``data`` and ``close`` while parsing.
    """

    def __init__(self, opts: gws.XmlOptions):
        # Options that control name conversion and whitespace handling.
        self.opts = opts
        # Outermost element; assigned by the first ``start`` call.
        self.root = None
        # Elements that are currently open, innermost last.
        self.stack = []
        # Character data chunks received since the last ``flush``.
        self.buf = []

    def convert_name(self, s: str) -> str:
        """Convert a tag or attribute name according to the options.

        The local name is lowercased when ``caseInsensitive`` is set. The
        ``{uri}`` prefix is kept only when the name has a namespace uri and
        ``removeNamespaces`` is not set.
        """
        _xmlns, uri, local = namespace.split_name(s)

        if self.opts.caseInsensitive:
            local = local.lower()

        if self.opts.removeNamespaces or not uri:
            return local

        return '{' + uri + '}' + local

    def make(self, tag: str, attrib: dict) -> gws.XmlElement:
        """Create a new element with converted tag and attribute names."""
        if attrib:
            converted = {self.convert_name(key): value for key, value in attrib.items()}
        else:
            converted = {}

        return element.XmlElement(self.convert_name(tag), converted)

    def flush(self):
        """Attach the buffered character data to the tree and empty the buffer.

        The text becomes the ``tail`` of the last child of the innermost open
        element if that element has children, otherwise its ``text``.
        """
        if not self.buf:
            return

        text, self.buf = ''.join(self.buf), []

        if self.opts.compactWhitespace:
            # ``split`` without arguments already ignores leading/trailing whitespace.
            text = ' '.join(text.split())

        if not text:
            return

        parent = self.stack[-1]
        if len(parent) > 0:
            parent[-1].tail = text
        else:
            parent.text = text

    def start(self, tag: str, attrib: dict):
        """Handle an opening tag: create the element and push it on the stack."""
        # Text seen so far belongs to what came before this element.
        self.flush()

        child = self.make(tag, attrib)

        if not self.stack:
            self.root = child
        else:
            self.stack[-1].append(child)

        self.stack.append(child)

    def end(self, tag):
        """Handle a closing tag: store pending text and pop the element."""
        self.flush()
        self.stack.pop()

    def data(self, data):
        """Collect a chunk of character data until the next ``flush``."""
        self.buf.append(data)

    def close(self):
        """Return the root element (``None`` if no element was started)."""
        return self.root
def _decode_input(inp) -> str:
    """Turn ``str`` or ``bytes`` input into a ``str`` without an XML declaration.

    A document may be declared as ISO-8859-1 while actually being UTF-8, or
    the other way round. To keep expat from trusting a possibly wrong
    declaration, the input is decoded here and the declaration is removed,
    so the parser always receives a plain ``str``.

    Raises:
        error.ParseError: If ``inp`` is neither ``str`` nor ``bytes``.
    """

    if isinstance(inp, str):
        return _decode_str_input(inp)

    if isinstance(inp, bytes):
        return _decode_bytes_input(inp)

    raise error.ParseError(f'invalid input type {type(inp)}')
128def _decode_bytes_input(inp: bytes) -> str:
129 inp = inp.strip()
131 encodings = []
133 if inp.startswith(b'<?xml'):
134 try:
135 end = inp.index(b'?>')
136 except ValueError:
137 raise error.ParseError('invalid XML declaration')
139 head = inp[:end].decode('ascii').lower()
140 m = re.search(r'encoding\s*=\s*(\S+)', head)
141 if m:
142 encodings.append(m.group(1).strip('\'"'))
143 inp = inp[end + 2 :]
145 # try the declared encoding, if any, then utf8, then latin
147 if 'utf-8' not in encodings:
148 encodings.append('utf-8')
149 if 'iso-8859-1' not in encodings:
150 encodings.append('iso-8859-1')
152 for enc in encodings:
153 try:
154 return inp.decode(encoding=enc, errors='strict')
155 except (LookupError, UnicodeDecodeError):
156 pass
158 raise error.ParseError(f'invalid document encoding, tried {",".join(encodings)}')
161def _decode_str_input(inp: str) -> str:
162 inp = inp.strip()
164 if inp.startswith('<?xml'):
165 try:
166 end = inp.index('?>')
167 except ValueError:
168 raise error.ParseError('invalid XML declaration')
169 return inp[end + 2 :]
171 return inp