Coverage for gws-app/gws/lib/xmlx/parser.py: 93%

106 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-16 23:09 +0200

1"""XML parser.""" 

2 

3from typing import Optional, cast 

4 

5import re 

6import xml.etree.ElementTree 

7 

8import gws 

9 

10from . import error, element, namespace 

11 

12 

def from_path(path: str, opts: Optional[gws.XmlOptions] = None) -> gws.XmlElement:
    """Parses the XML file at ``path`` into an ``XmlElement``.

    The file is read in binary mode and handed to the parser as bytes.

    Args:
        path: Path to the .xml file.
        opts: XML options.

    Returns:
        The root element of the parsed document.
    """

    with open(path, 'rb') as xml_file:
        content = xml_file.read()
    return _parse(content, opts)

24 

25 

def from_string(inp: str | bytes, opts: Optional[gws.XmlOptions] = None) -> gws.XmlElement:
    """Parses an in-memory XML document into an ``XmlElement``.

    Args:
        inp: The XML document, either as text or as raw bytes.
        opts: XML options.

    Returns:
        The root element of the parsed document.
    """

    root = _parse(inp, opts)
    return root

35 

36 

37## 

38 

39 

def _parse(inp, opts: Optional[gws.XmlOptions] = None) -> gws.XmlElement:
    """Decodes the input and runs it through an ElementTree parser.

    Args:
        inp: XML document as ``str`` or ``bytes``.
        opts: XML options; default options are used if omitted.

    Returns:
        The root element produced by the parser target.

    Raises:
        error.ParseError: If the underlying parser reports a parse error.
    """

    text = _decode_input(inp)
    target = _ParserTarget(opts or gws.XmlOptions())
    parser = xml.etree.ElementTree.XMLParser(target=target)

    try:
        parser.feed(text)
        root = parser.close()
    except xml.etree.ElementTree.ParseError as exc:
        # re-raise as the package's own error type, keeping the original message
        raise error.ParseError(exc.args[0]) from exc

    return cast(gws.XmlElement, root)

48 

49 

class _ParserTarget:
    """Parser target that assembles a tree of ``XmlElement`` objects.

    Implements the ``start``/``end``/``data``/``close`` callbacks
    invoked by ``xml.etree.ElementTree.XMLParser``.
    """

    def __init__(self, opts: gws.XmlOptions):
        self.opts = opts
        # first element ever started; returned by ``close``
        self.root = None
        # currently open elements, innermost last
        self.stack = []
        # character data chunks collected since the last flush
        self.buf = []

    def convert_name(self, s: str) -> str:
        """Converts a tag or attribute name according to the options.

        The local name is lowercased if ``caseInsensitive`` is set.
        The namespace uri is kept, as ``{uri}name``, only if the name
        has one and ``removeNamespaces`` is not set.
        """

        _xmlns, uri, pname = namespace.split_name(s)
        if self.opts.caseInsensitive:
            pname = pname.lower()
        if not self.opts.removeNamespaces and uri:
            return '{' + uri + '}' + pname
        return pname

    def make(self, tag: str, attrib: dict) -> gws.XmlElement:
        """Creates an element with converted tag and attribute names."""

        converted = {self.convert_name(key): value for key, value in attrib.items()} if attrib else {}
        return element.XmlElement(self.convert_name(tag), converted)

    def flush(self):
        """Attaches the buffered character data to the tree and clears the buffer.

        The text becomes the ``tail`` of the last child of the innermost
        open element, or the ``text`` of that element if it has no children.
        """

        if not self.buf:
            return

        text, self.buf = ''.join(self.buf), []

        if self.opts.compactWhitespace:
            # split() without arguments also discards leading and trailing whitespace
            text = ' '.join(text.split())

        if not text:
            return

        parent = self.stack[-1]
        if len(parent) == 0:
            parent.text = text
        else:
            parent[-1].tail = text

    def start(self, tag: str, attrib: dict):
        """Opens a new element under the innermost open one."""

        self.flush()
        el = self.make(tag, attrib)
        if not self.stack:
            # nothing is open yet, so this is the document element
            self.root = el
        else:
            self.stack[-1].append(el)
        self.stack.append(el)

    def end(self, tag):
        """Closes the innermost open element."""

        self.flush()
        self.stack.pop()

    def data(self, data):
        """Buffers a chunk of character data until the next flush."""

        self.buf.append(data)

    def close(self):
        """Returns the root element."""

        return self.root

113 

114 

115def _decode_input(inp) -> str: 

116 # the problem is, we can receive a document 

117 # that is declared ISO-8859-1, but actually is UTF and vice versa. 

118 # therefore, don't let expat do the decoding, always give it a `str` 

119 # and remove the xml decl with the (possibly incorrect) encoding 

120 

121 if isinstance(inp, bytes): 

122 return _decode_bytes_input(inp) 

123 if isinstance(inp, str): 

124 return _decode_str_input(inp) 

125 raise error.ParseError(f'invalid input type {type(inp)}') 

126 

127 

128def _decode_bytes_input(inp: bytes) -> str: 

129 inp = inp.strip() 

130 

131 encodings = [] 

132 

133 if inp.startswith(b'<?xml'): 

134 try: 

135 end = inp.index(b'?>') 

136 except ValueError: 

137 raise error.ParseError('invalid XML declaration') 

138 

139 head = inp[:end].decode('ascii').lower() 

140 m = re.search(r'encoding\s*=\s*(\S+)', head) 

141 if m: 

142 encodings.append(m.group(1).strip('\'"')) 

143 inp = inp[end + 2 :] 

144 

145 # try the declared encoding, if any, then utf8, then latin 

146 

147 if 'utf-8' not in encodings: 

148 encodings.append('utf-8') 

149 if 'iso-8859-1' not in encodings: 

150 encodings.append('iso-8859-1') 

151 

152 for enc in encodings: 

153 try: 

154 return inp.decode(encoding=enc, errors='strict') 

155 except (LookupError, UnicodeDecodeError): 

156 pass 

157 

158 raise error.ParseError(f'invalid document encoding, tried {",".join(encodings)}') 

159 

160 

161def _decode_str_input(inp: str) -> str: 

162 inp = inp.strip() 

163 

164 if inp.startswith('<?xml'): 

165 try: 

166 end = inp.index('?>') 

167 except ValueError: 

168 raise error.ParseError('invalid XML declaration') 

169 return inp[end + 2 :] 

170 

171 return inp