Package Gnumed :: Package timelinelib :: Package xml :: Module parser
[frames] | no frames]

Source Code for Module Gnumed.timelinelib.xml.parser

  1  # Copyright (C) 2009, 2010, 2011  Rickard Lindberg, Roger Lindberg 
  2  # 
  3  # This file is part of Timeline. 
  4  # 
  5  # Timeline is free software: you can redistribute it and/or modify 
  6  # it under the terms of the GNU General Public License as published by 
  7  # the Free Software Foundation, either version 3 of the License, or 
  8  # (at your option) any later version. 
  9  # 
 10  # Timeline is distributed in the hope that it will be useful, 
 11  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 13  # GNU General Public License for more details. 
 14  # 
 15  # You should have received a copy of the GNU General Public License 
 16  # along with Timeline.  If not, see <http://www.gnu.org/licenses/>. 
 17   
 18   
 19  """ 
 20  A simple, validating, SAX-based XML parser. 
 21   
 22  Since it is simple, it has some limitations: 
 23   
 24      - It can not parse attributes 
 25      - It can not parse arbitrary nested structures 
 26      - It can only parse text in leaf nodes: in other words, this piece of XML 
 27        is not possible to parse: <a>some text <b>here</b> and there</a> 
 28   
 29  Here's an example of how to parse a simple XML document using this module. 
 30   
 31  First we create a file-like object containing the XML data (any file-like 
 32  object is fine, but we create a StringIO for the purpose of making a working 
 33  example): 
 34   
 35      >>> from StringIO import StringIO 
 36   
 37      >>> xml_stream = StringIO(''' 
 38      ... <db> 
 39      ...     <person> 
 40      ...         <name>Rickard</name> 
 41      ...     </person> 
 42      ...     <person> 
 43      ...         <name>James</name> 
 44      ...         <age>38</age> 
 45      ...     </person> 
 46      ... </db> 
 47      ... ''') 
 48   
 49  Then we define two parser functions that we later associate with Tag objects. 
 50  Parse functions are called when the end tag has been read. The first argument 
 51  to a parse function is the text that the tag contains. It will be empty for all 
 52  tags except leaf tags. The second argument is a dictionary that can be used to 
 53  store temporary variables. This dictionary is passed to all parse functions, 
 54  providing a way to share information between parse functions. 
 55   
 56      >>> def parse_name(text, tmp_dict): 
 57      ...     tmp_dict["tmp_name"] = text 
 58   
 59      >>> def parse_person(text, tmp_dict): 
 60      ...     # text is empty here since person is not a leaf tag 
 61      ...     name = tmp_dict.pop("tmp_name") 
 62      ...     age = tmp_dict.pop("tmp_age", None) 
 63      ...     print("Found %s in db." % name) 
 64      ...     if age is not None: 
 65      ...         print("%s is %s years old." % (name, age)) 
 66   
 67  Next we define the structure of the XML document that we are going to parse by 
 68  creating Tag objects. The first argument is the name of the tag, the second 
 69  specifies how many times it can occur inside its parent (should be one of 
 70  SINGLE, OPTIONAL, or ANY), the third argument is the parse function to be used 
 71  for this tag (can be None if no parsing is needed), and the fourth argument is 
 72  a list of child tags. 
 73   
 74      >>> root_tag = Tag("db", SINGLE, None, [ 
 75      ...     Tag("person", ANY, parse_person, [ 
 76      ...         Tag("name", SINGLE, parse_name), 
 77      ...         Tag("age", OPTIONAL, parse_fn_store("tmp_age")), 
 78      ...     ]), 
 79      ... ]) 
 80   
 81  The parse_fn_store function returns a parser function that works exactly like 
 82  parse_name: it takes the text of the tag and stores it in the dictionary with 
 83  the given key (tmp_age in this case). 
 84   
 85  The last step is to call the parse function with the stream, the tag 
 86  configuration, and a dictionary. The dictionary can be populated with values 
 87  before parsing starts if needed. 
 88   
 89      >>> parse(xml_stream, root_tag, {}) 
 90      Found Rickard in db. 
 91      Found James in db. 
 92      James is 38 years old. 
 93   
 94  The parse function will raise a ValidationError if the XML is not valid and a 
 95  SAXException if the XML is not well-formed. 
 96  """ 
 97   
 98   
 99  from xml.sax import parse as sax_parse 
100  import sys 
101  import xml.sax.handler 
102   
103   
104  # Occurrence rules for tags 
105  SINGLE = 1 
106  OPTIONAL = 2 
107  ANY = 3 
108   
109   
class ValidationError(Exception):
    """Signal that a parsed XML document violates the expected schema."""
113 114
class Tag(object):
    """
    Represents a tag in an xml document.

    A tree of Tag objects defines the structure an xml document must have
    and holds the parser functions for its individual parts. Parser
    functions are called when the end tag has been read.

    Besides the static description, each Tag also carries the state of the
    parse that is currently in progress (occurrences,
    next_possible_child_pos, start_read).

    See the SaxHandler class in this module to see how this class is used.
    """

    def __init__(self, name, occurrence_rule, parse_fn, child_tags=()):
        """
        name is the name of the xml element this Tag matches.

        occurrence_rule is one of SINGLE, OPTIONAL, or ANY and controls how
        many times the element may occur inside its parent.

        parse_fn is called as parse_fn(text, tmp_dict) when the end tag has
        been read; it may be None if no parsing is needed.

        child_tags is an iterable of Tag objects. Children must appear in
        the document in the order given here. The default is an immutable
        empty tuple so no mutable object is shared between calls.
        """
        self.name = name
        self.occurrence_rule = occurrence_rule
        self.parse_fn = parse_fn
        self.child_tags = []
        self.add_child_tags(child_tags)
        self.parent = None
        # Variables defining state
        self.occurrences = 0
        self.next_possible_child_pos = 0
        self.start_read = False

    def add_child_tags(self, tags):
        """Append every Tag in tags as a child of this tag."""
        for tag in tags:
            self.add_child_tag(tag)

    def add_child_tag(self, tag):
        """Append tag as the last child and make this tag its parent."""
        tag.parent = self
        self.child_tags.append(tag)

    def read_enough_times(self):
        """Return True if the parent may be closed without reading more."""
        return self.occurrences > 0 or self.occurrence_rule in (OPTIONAL, ANY)

    def can_read_more(self):
        """Return True if one more occurrence of this tag is allowed."""
        return self.occurrences == 0 or self.occurrence_rule == ANY

    def handle_start_tag(self, name, tmp_dict):
        """
        Process the start tag <name> and return the Tag that is now open.

        Raises ValidationError if <name> is not allowed at this point.
        """
        if name == self.name:
            if self.start_read:
                # Nested tag
                raise ValidationError("Did not expect <%s>." % name)
            self.start_read = True
            return self
        if self.start_read:
            # This tag is open, so the start tag must belong to a child.
            next_child = self._find_next_child(name)
            return next_child.handle_start_tag(name, tmp_dict)
        raise ValidationError("Expected <%s> but got <%s>."
                              % (self.name, name))

    def handle_end_tag(self, name, text, tmp_dict):
        """
        Process the end tag </name> and return the parent Tag.

        The element is validated completely (matching end tag, no stray
        text, all required children read) before the parser function is
        called, so the parser function never runs on an element that is
        missing required children.

        Raises ValidationError if the element is not valid.
        """
        self._ensure_end_tag_valid(name, text)
        self._ensure_all_children_read()
        if self.parse_fn is not None:
            self.parse_fn(text, tmp_dict)
        self._reset_parse_data()
        self.occurrences += 1
        return self.parent

    def _ensure_end_tag_valid(self, name, text):
        """Raise ValidationError on a wrong end tag or text in a non-leaf."""
        if name != self.name:
            raise ValidationError("Expected </%s> but got </%s>."
                                  % (self.name, name))
        if self.child_tags:
            # Only leaf tags may contain (non-whitespace) text.
            if text.strip():
                raise ValidationError("Did not expect text but got '%s'."
                                      % text)

    def _ensure_all_children_read(self):
        """Raise ValidationError if a remaining child is still required."""
        num_child_tags = len(self.child_tags)
        while self.next_possible_child_pos < num_child_tags:
            child = self.child_tags[self.next_possible_child_pos]
            if not child.read_enough_times():
                raise ValidationError("<%s> not read enough times."
                                      % child.name)
            self.next_possible_child_pos += 1

    def _reset_parse_data(self):
        """Reset the per-element state so this tag can be read again."""
        for child_tag in self.child_tags:
            child_tag.occurrences = 0
        self.next_possible_child_pos = 0
        self.start_read = False

    def _find_next_child(self, name):
        """
        Return the child Tag that a start tag <name> belongs to.

        Children are tried in order starting at next_possible_child_pos; a
        child is skipped only if it has been read enough times.

        Raises ValidationError if no child can accept <name>.
        """
        num_child_tags = len(self.child_tags)
        while self.next_possible_child_pos < num_child_tags:
            child = self.child_tags[self.next_possible_child_pos]
            if child.name == name:
                if child.can_read_more():
                    return child
                break
            if not child.read_enough_times():
                break
            self.next_possible_child_pos += 1
        raise ValidationError("Did not expect <%s>." % name)
216 217
class SaxHandler(xml.sax.handler.ContentHandler):
    """
    SAX content handler that validates a document against a Tag tree.

    SAX events are forwarded to the Tag that is currently being parsed
    (tag_to_parse); each Tag returns the Tag that should receive the next
    event. Character data is collected in text until the next start or end
    tag is seen.
    """

    def __init__(self, root_tag, tmp_dict):
        """
        root_tag is the Tag describing the root element of the document.

        tmp_dict is passed on to the Tag objects, which hand it to the
        parser functions.
        """
        # ContentHandler is an old-style class on Python 2, so the base
        # initializer is called explicitly instead of through super().
        xml.sax.handler.ContentHandler.__init__(self)
        self.tag_to_parse = root_tag
        self.tmp_dict = tmp_dict
        self.text = ""

    def startElement(self, name, attrs):
        """
        Called when a start tag has been read.

        Raises ValidationError if the tag has attributes, if non-whitespace
        text precedes it, or if the tag is not expected at this point.
        """
        if attrs.getLength() > 0:
            raise ValidationError("Did not expect attributes on <%s>." % name)
        if self.text.strip():
            raise ValidationError("Did not expect text but got '%s'."
                                  % self.text)
        self.tag_to_parse = self.tag_to_parse.handle_start_tag(name,
                                                               self.tmp_dict)
        self.text = ""

    def endElement(self, name):
        """
        Called when an end tag (and everything between the start and end tag)
        has been read.
        """
        self.tag_to_parse = self.tag_to_parse.handle_end_tag(name, self.text,
                                                             self.tmp_dict)
        self.text = ""

    def characters(self, content):
        """Collect character data; it may arrive in several chunks."""
        self.text += content
249 250
def parse(xml, schema, tmp_dict):
    """
    Parse and validate xml against schema.

    xml should be a filename or a file-like object containing xml data.

    schema should be a Tag object defining the structure of the xml document.

    tmp_dict is used by parser functions in Tag objects to share data. It can
    be pre-populated with values.

    Raises ValidationError if the document does not follow schema; errors
    from the underlying SAX parser propagate unchanged.
    """
    # The name `unicode` only exists on Python 2. The version test
    # short-circuits first, so the name is never evaluated on Python 3,
    # where it would raise NameError.
    if sys.version_info[0] < 3 and isinstance(xml, unicode):
        # Workaround for "Sax parser crashes if given unicode file name" bug:
        # http://bugs.python.org/issue11159
        xml = xml.encode(sys.getfilesystemencoding())
    sax_parse(xml, SaxHandler(schema, tmp_dict))
265 266
def parse_fn_store(store_key):
    """
    Build a parser function that saves the tag text under store_key.

    The returned function takes (text, tmp_dict) like every parser function
    and assigns text to tmp_dict[store_key].
    """
    def store_text(text, tmp_dict):
        tmp_dict[store_key] = text

    return store_text