1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 """
20 A simple, validating, SAX-based XML parser.
21
22 Since it is simple, it has some limitations:
23
24 - It cannot parse attributes
25 - It cannot parse arbitrarily nested structures
26 - It can only parse text in leaf nodes: in other words, this piece of XML
27 is not possible to parse: <a>some text <b>here</b> and there</a>
28
29 Here's an example of how to parse a simple XML document using this module.
30
31 First we create a file-like object containing the XML data (any file-like
32 object is fine, but we create a StringIO for the purpose of making a working
33 example):
34
35 >>> from StringIO import StringIO
36
37 >>> xml_stream = StringIO('''
38 ... <db>
39 ... <person>
40 ... <name>Rickard</name>
41 ... </person>
42 ... <person>
43 ... <name>James</name>
44 ... <age>38</age>
45 ... </person>
46 ... </db>
47 ... ''')
48
49 Then we define two parser functions that we later associate with Tag objects.
50 Parse functions are called when the end tag has been read. The first argument
51 to a parse function is the text that the tag contains. It will be empty for all
52 tags except leaf tags. The second argument is a dictionary that can be used to
53 store temporary variables. This dictionary is passed to all parse functions,
54 providing a way to share information between parse functions.
55
56 >>> def parse_name(text, tmp_dict):
57 ... tmp_dict["tmp_name"] = text
58
59 >>> def parse_person(text, tmp_dict):
60 ... # text is empty here since person is not a leaf tag
61 ... name = tmp_dict.pop("tmp_name")
62 ... age = tmp_dict.pop("tmp_age", None)
63 ... print("Found %s in db." % name)
64 ... if age is not None:
65 ... print("%s is %s years old." % (name, age))
66
67 Next we define the structure of the XML document that we are going to parse by
68 creating Tag objects. The first argument is the name of the tag, the second
69 specifies how many times it can occur inside its parent (should be one of
70 SINGLE, OPTIONAL, or ANY), the third argument is the parse function to be used
71 for this tag (can be None if no parsing is needed), and the fourth argument is
72 a list of child tags.
73
74 >>> root_tag = Tag("db", SINGLE, None, [
75 ... Tag("person", ANY, parse_person, [
76 ... Tag("name", SINGLE, parse_name),
77 ... Tag("age", OPTIONAL, parse_fn_store("tmp_age")),
78 ... ]),
79 ... ])
80
81 The parse_fn_store function returns a parser function that works exactly like
82 parse_name: it takes the text of the tag and stores it in the dictionary with
83 the given key (tmp_age in this case).
84
85 The last step is to call the parse function with the stream, the tag
86 configuration, and a dictionary. The dictionary can be populated with values
87 before parsing starts if needed.
88
89 >>> parse(xml_stream, root_tag, {})
90 Found Rickard in db.
91 Found James in db.
92 James is 38 years old.
93
94 The parse function will raise a ValidationError if the XML is not valid and a
95 SAXException if the XML is not well-formed.
96 """
97
98
99 from xml.sax import parse as sax_parse
100 import sys
101 import xml.sax.handler
102
103
104
105 SINGLE = 1
106 OPTIONAL = 2
107 ANY = 3
108
109
111 """Raised when parsed xml document does not follow the schema."""
112 pass
113
114
116 """
117 Represents a tag in an xml document.
118
119 Used to define structure of an xml document and define parser functions for
120 individual parts of an xml document.
121
122 Parser functions are called when the end tag has been read.
123
124 See SaxHandler class defined below to see how this class is used.
125 """
126
def __init__(self, name, occurrence_rule, parse_fn, child_tags=None):
    """
    Create a tag definition.

    name is the name of the xml tag.

    occurrence_rule specifies how many times the tag can occur inside its
    parent (should be one of SINGLE, OPTIONAL, or ANY).

    parse_fn is the function called with the tag's text and the shared
    dictionary when the end tag has been read. It can be None if no
    parsing is needed.

    child_tags is an optional list of Tag objects that may appear inside
    this tag. Omitting it (or passing None) means the tag has no children.
    """
    self.name = name
    self.occurrence_rule = occurrence_rule
    self.parse_fn = parse_fn
    self.child_tags = []
    # None is used as the default instead of a mutable [] so that no list
    # object is shared between calls.
    self.add_child_tags([] if child_tags is None else child_tags)
    self.parent = None
    # State that changes while a document is being parsed.
    # Number of times this tag has been completely read.
    self.occurrences = 0
    # Index into child_tags of the first child that may still be read.
    self.next_possible_child_pos = 0
    # True between this tag's start tag and its end tag.
    self.start_read = False
138
142
144 tag.parent = self
145 self.child_tags.append(tag)
146
148 return self.occurrences > 0 or self.occurrence_rule in (OPTIONAL, ANY)
149
151 return self.occurrences == 0 or self.occurrence_rule == ANY
152
154 if name == self.name:
155 if self.start_read == True:
156
157 raise ValidationError("Did not expect <%s>." % name)
158 else:
159 self.start_read = True
160 return self
161 elif self.start_read == True:
162 next_child = self._find_next_child(name)
163 return next_child.handle_start_tag(name, tmp_dict)
164 else:
165 raise ValidationError("Expected <%s> but got <%s>."
166 % (self.name, name))
167
169 self._ensure_end_tag_valid(name, text)
170 if self.parse_fn is not None:
171 self.parse_fn(text, tmp_dict)
172 self._ensure_all_children_read()
173 self._reset_parse_data()
174 self.occurrences += 1
175 return self.parent
176
185
187 num_child_tags = len(self.child_tags)
188 while self.next_possible_child_pos < num_child_tags:
189 child = self.child_tags[self.next_possible_child_pos]
190 if not child.read_enough_times():
191 raise ValidationError("<%s> not read enough times."
192 % child.name)
193 self.next_possible_child_pos += 1
194
196 for child_tag in self.child_tags:
197 child_tag.occurrences = 0
198 self.next_possible_child_pos = 0
199 self.start_read = False
200
202 num_child_tags = len(self.child_tags)
203 while self.next_possible_child_pos < num_child_tags:
204 child = self.child_tags[self.next_possible_child_pos]
205 if child.name == name:
206 if child.can_read_more():
207 return child
208 else:
209 break
210 else:
211 if child.read_enough_times():
212 self.next_possible_child_pos += 1
213 else:
214 break
215 raise ValidationError("Did not expect <%s>." % name)
216
217
219
def __init__(self, root_tag, tmp_dict):
    """
    Set up the handler for a new parse.

    root_tag is the tag that parsing starts at.

    tmp_dict is the dictionary handed on to the tags' start and end
    handlers.
    """
    # No character data has been collected yet.
    self.text = ""
    self.tmp_dict = tmp_dict
    # The tag that the next start or end event is dispatched to.
    self.tag_to_parse = root_tag
224
226 """
227 Called when a start tag has been read.
228 """
229 if attrs.getLength() > 0:
230 raise ValidationError("Did not expect attributes on <%s>." % name)
231 if self.text.strip():
232 raise ValidationError("Did not expect text but got '%s'."
233 % self.text)
234 self.tag_to_parse = self.tag_to_parse.handle_start_tag(name,
235 self.tmp_dict)
236 self.text = ""
237
239 """
240 Called when an end tag (and everything between the start and end tag)
241 has been read.
242 """
243 self.tag_to_parse = self.tag_to_parse.handle_end_tag(name, self.text,
244 self.tmp_dict)
245 self.text = ""
246
249
250
def parse(xml, schema, tmp_dict):
    """
    Parse an xml document and validate it against a schema.

    xml should be a filename or a file-like object containing xml data.

    schema should be a Tag object defining the structure of the xml document.

    tmp_dict is used by parser functions in Tag objects to share data. It can
    be pre-populated with values.

    Nothing is returned: results are produced by the parser functions
    attached to the Tag objects.
    """
    try:
        unicode_type = unicode
    except NameError:
        # Python 3 has no separate unicode type, so there is nothing to
        # encode and the value is passed on unchanged.
        unicode_type = None
    if unicode_type is not None and isinstance(xml, unicode_type):
        # NOTE(review): presumably a workaround for the Python 2 SAX parser
        # failing on unicode file names (cf. bugs.python.org/issue11159) --
        # confirm before removing.
        xml = xml.encode(sys.getfilesystemencoding())
    sax_parse(xml, SaxHandler(schema, tmp_dict))
265
266
268 def fn(text, tmp_dict):
269 tmp_dict[store_key] = text
270 return fn
271