1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42 """
43 Provides general XML-related functionality.
44
45 What I'm trying to do here is abstract much of the functionality that directly
46 accesses the DOM tree. This is not so much to "protect" the other code from
47 the DOM, but to standardize the way it's used. It will also make it easier
48 for extension authors to write code that looks like the rest of Cedar Backup.
49
50 @sort: createInputDom, createOutputDom, serializeDom, isElement, readChildren,
51 readFirstChild, readStringList, readString, readInteger, readBoolean,
52 addContainerNode, addStringNode, addIntegerNode, addBooleanNode,
53 TRUE_BOOLEAN_VALUES, FALSE_BOOLEAN_VALUES, VALID_BOOLEAN_VALUES
54
55 @var TRUE_BOOLEAN_VALUES: List of boolean values in XML representing C{True}.
56 @var FALSE_BOOLEAN_VALUES: List of boolean values in XML representing C{False}.
57 @var VALID_BOOLEAN_VALUES: List of valid boolean values in XML.
58
59 @author: Kenneth J. Pronovici <pronovic@ieee.org>
60 """
61
62
63
64
65
66
67
68 import sys
69 import re
70 import logging
71 import codecs
72 from types import UnicodeType
73 from StringIO import StringIO
74
75
76 from xml.parsers.expat import ExpatError
77 from xml.dom.minidom import Node
78 from xml.dom.minidom import getDOMImplementation
79 from xml.dom.minidom import parseString
80
81
82
83
84
85
86 logger = logging.getLogger("CedarBackup2.log.xml")
87
88 TRUE_BOOLEAN_VALUES = [ "Y", "y", ]
89 FALSE_BOOLEAN_VALUES = [ "N", "n", ]
90 VALID_BOOLEAN_VALUES = TRUE_BOOLEAN_VALUES + FALSE_BOOLEAN_VALUES
91
92
93
94
95
96
110
112 """
113 Creates a DOM tree used for writing an XML document.
114 @param name: Base name of the document (root node name).
115 @return: Tuple (xmlDom, parentNode) for the new document
116 """
117 impl = getDOMImplementation()
118 xmlDom = impl.createDocument(None, name, None)
119 return (xmlDom, xmlDom.documentElement)
120
121
122
123
124
125
127 """
128 Returns True or False depending on whether the XML node is an element node.
129 """
130 return node.nodeType == Node.ELEMENT_NODE
131
133 """
134 Returns a list of nodes with a given name immediately beneath the
135 parent.
136
137 By "immediately beneath" the parent, we mean from among nodes that are
138 direct children of the passed-in parent node.
139
140 Underneath, we use the Python C{getElementsByTagName} method, which is
141 pretty cool, but which (surprisingly?) returns a list of all children
142 with a given name below the parent, at any level. We just prune that
143 list to include only children whose C{parentNode} matches the passed-in
144 parent.
145
146 @param parent: Parent node to search beneath.
147 @param name: Name of nodes to search for.
148
149 @return: List of child nodes with correct parent, or an empty list if
150 no matching nodes are found.
151 """
152 lst = []
153 if parent is not None:
154 result = parent.getElementsByTagName(name)
155 for entry in result:
156 if entry.parentNode is parent:
157 lst.append(entry)
158 return lst
159
161 """
162 Returns the first child with a given name immediately beneath the parent.
163
164 By "immediately beneath" the parent, we mean from among nodes that are
165 direct children of the passed-in parent node.
166
167 @param parent: Parent node to search beneath.
168 @param name: Name of node to search for.
169
170 @return: First properly-named child of parent, or C{None} if no matching nodes are found.
171 """
172 result = readChildren(parent, name)
173 if result is None or result == []:
174 return None
175 return result[0]
176
178 """
179 Returns a list of the string contents associated with nodes with a given
180 name immediately beneath the parent.
181
182 By "immediately beneath" the parent, we mean from among nodes that are
183 direct children of the passed-in parent node.
184
185 First, we find all of the nodes using L{readChildren}, and then we
186 retrieve the "string contents" of each of those nodes. The returned list
187 has one entry per matching node. We assume that string contents of a
188 given node belong to the first C{TEXT_NODE} child of that node. Nodes
189 which have no C{TEXT_NODE} children are not represented in the returned
190 list.
191
192 @param parent: Parent node to search beneath.
193 @param name: Name of node to search for.
194
195 @return: List of strings as described above, or C{None} if no matching nodes are found.
196 """
197 lst = []
198 result = readChildren(parent, name)
199 for entry in result:
200 if entry.hasChildNodes():
201 for child in entry.childNodes:
202 if child.nodeType == Node.TEXT_NODE:
203 lst.append(child.nodeValue)
204 break
205 if lst == []:
206 lst = None
207 return lst
208
210 """
211 Returns string contents of the first child with a given name immediately
212 beneath the parent.
213
214 By "immediately beneath" the parent, we mean from among nodes that are
215 direct children of the passed-in parent node. We assume that string
216 contents of a given node belong to the first C{TEXT_NODE} child of that
217 node.
218
219 @param parent: Parent node to search beneath.
220 @param name: Name of node to search for.
221
222 @return: String contents of node or C{None} if no matching nodes are found.
223 """
224 result = readStringList(parent, name)
225 if result is None:
226 return None
227 return result[0]
228
230 """
231 Returns integer contents of the first child with a given name immediately
232 beneath the parent.
233
234 By "immediately beneath" the parent, we mean from among nodes that are
235 direct children of the passed-in parent node.
236
237 @param parent: Parent node to search beneath.
238 @param name: Name of node to search for.
239
240 @return: Integer contents of node or C{None} if no matching nodes are found.
241 @raise ValueError: If the string at the location can't be converted to an integer.
242 """
243 result = readString(parent, name)
244 if result is None:
245 return None
246 else:
247 return int(result)
248
250 """
251 Returns long integer contents of the first child with a given name immediately
252 beneath the parent.
253
254 By "immediately beneath" the parent, we mean from among nodes that are
255 direct children of the passed-in parent node.
256
257 @param parent: Parent node to search beneath.
258 @param name: Name of node to search for.
259
260 @return: Long integer contents of node or C{None} if no matching nodes are found.
261 @raise ValueError: If the string at the location can't be converted to an integer.
262 """
263 result = readString(parent, name)
264 if result is None:
265 return None
266 else:
267 return long(result)
268
270 """
271 Returns float contents of the first child with a given name immediately
272 beneath the parent.
273
274 By "immediately beneath" the parent, we mean from among nodes that are
275 direct children of the passed-in parent node.
276
277 @param parent: Parent node to search beneath.
278 @param name: Name of node to search for.
279
280 @return: Float contents of node or C{None} if no matching nodes are found.
281 @raise ValueError: If the string at the location can't be converted to a
282 float value.
283 """
284 result = readString(parent, name)
285 if result is None:
286 return None
287 else:
288 return float(result)
289
291 """
292 Returns boolean contents of the first child with a given name immediately
293 beneath the parent.
294
295 By "immediately beneath" the parent, we mean from among nodes that are
296 direct children of the passed-in parent node.
297
298 The string value of the node must be one of the values in L{VALID_BOOLEAN_VALUES}.
299
300 @param parent: Parent node to search beneath.
301 @param name: Name of node to search for.
302
303 @return: Boolean contents of node or C{None} if no matching nodes are found.
304 @raise ValueError: If the string at the location can't be converted to a boolean.
305 """
306 result = readString(parent, name)
307 if result is None:
308 return None
309 else:
310 if result in TRUE_BOOLEAN_VALUES:
311 return True
312 elif result in FALSE_BOOLEAN_VALUES:
313 return False
314 else:
315 raise ValueError("Boolean values must be one of %s." % VALID_BOOLEAN_VALUES)
316
317
318
319
320
321
323 """
324 Adds a container node as the next child of a parent node.
325
326 @param xmlDom: DOM tree as from C{impl.createDocument()}.
327 @param parentNode: Parent node to create child for.
328 @param nodeName: Name of the new container node.
329
330 @return: Reference to the newly-created node.
331 """
332 containerNode = xmlDom.createElement(nodeName)
333 parentNode.appendChild(containerNode)
334 return containerNode
335
337 """
338 Adds a text node as the next child of a parent, to contain a string.
339
340 If the C{nodeValue} is None, then the node will be created, but will be
341 empty (i.e. will contain no text node child).
342
343 @param xmlDom: DOM tree as from C{impl.createDocument()}.
344 @param parentNode: Parent node to create child for.
345 @param nodeName: Name of the new container node.
346 @param nodeValue: The value to put into the node.
347
348 @return: Reference to the newly-created node.
349 """
350 containerNode = addContainerNode(xmlDom, parentNode, nodeName)
351 if nodeValue is not None:
352 textNode = xmlDom.createTextNode(nodeValue)
353 containerNode.appendChild(textNode)
354 return containerNode
355
357 """
358 Adds a text node as the next child of a parent, to contain an integer.
359
360 If the C{nodeValue} is None, then the node will be created, but will be
361 empty (i.e. will contain no text node child).
362
363 The integer will be converted to a string using "%d". The result will be
364 added to the document via L{addStringNode}.
365
366 @param xmlDom: DOM tree as from C{impl.createDocument()}.
367 @param parentNode: Parent node to create child for.
368 @param nodeName: Name of the new container node.
369 @param nodeValue: The value to put into the node.
370
371 @return: Reference to the newly-created node.
372 """
373 if nodeValue is None:
374 return addStringNode(xmlDom, parentNode, nodeName, None)
375 else:
376 return addStringNode(xmlDom, parentNode, nodeName, "%d" % nodeValue)
377
def addLongNode(xmlDom, parentNode, nodeName, nodeValue):
    """
    Adds a child element holding a long integer beneath a parent node.

    The value is rendered as text with the C{"%d"} format and the element is
    created via L{addStringNode}.  When C{nodeValue} is C{None}, C{None} is
    handed to L{addStringNode} instead, so the element is created without any
    text node child.

    @param xmlDom: DOM tree as from C{impl.createDocument()}.
    @param parentNode: Parent node to create child for.
    @param nodeName: Name of the new container node.
    @param nodeValue: The long integer to put into the node, or C{None}.

    @return: Reference to the newly-created node.
    """
    text = None
    if nodeValue is not None:
        text = "%d" % nodeValue
    return addStringNode(xmlDom, parentNode, nodeName, text)
399
401 """
402 Adds a text node as the next child of a parent, to contain a boolean.
403
404 If the C{nodeValue} is None, then the node will be created, but will be
405 empty (i.e. will contain no text node child).
406
407 Boolean C{True}, or anything else interpreted as C{True} by Python, will
408 be converted to a string "Y". Anything else will be converted to a
409 string "N". The result is added to the document via L{addStringNode}.
410
411 @param xmlDom: DOM tree as from C{impl.createDocument()}.
412 @param parentNode: Parent node to create child for.
413 @param nodeName: Name of the new container node.
414 @param nodeValue: The value to put into the node.
415
416 @return: Reference to the newly-created node.
417 """
418 if nodeValue is None:
419 return addStringNode(xmlDom, parentNode, nodeName, None)
420 else:
421 if nodeValue:
422 return addStringNode(xmlDom, parentNode, nodeName, "Y")
423 else:
424 return addStringNode(xmlDom, parentNode, nodeName, "N")
425
426
427
428
429
430
432 """
433 Serializes a DOM tree and returns the result in a string.
434 @param xmlDom: XML DOM tree to serialize
435 @param indent: Number of spaces to indent, as an integer
436 @return: String form of DOM tree, pretty-printed.
437 """
438 xmlBuffer = StringIO()
439 serializer = Serializer(xmlBuffer, "UTF-8", indent=indent)
440 serializer.serialize(xmlDom)
441 xmlData = xmlBuffer.getvalue()
442 xmlBuffer.close()
443 return xmlData
444
446
447 """
448 XML serializer class.
449
450 This is a customized serializer that I hacked together based on what I found
451 in the PyXML distribution. Basically, around release 2.7.0, the only reason
452 I still had around a dependency on PyXML was for the PrettyPrint
453 functionality, and that seemed pointless. So, I stripped the PrettyPrint
454 code out of PyXML and hacked bits of it off until it did just what I needed
455 and no more.
456
457 This code started out being called PrintVisitor, but I decided it makes more
458 sense just calling it a serializer. I've made nearly all of the methods
459 private, and I've added a new high-level serialize() method rather than
460 having clients call C{visit()}.
461
462 Anyway, as a consequence of my hacking with it, this can't quite be called a
463 complete XML serializer any more. I ripped out support for HTML and XHTML,
464 and there is also no longer any support for namespaces (which I took out
465 because this dragged along a lot of extra code, and Cedar Backup doesn't use
466 namespaces). However, everything else should pretty much work as expected.
467
468 @copyright: This code, prior to customization, was part of the PyXML
469 codebase, and before that was part of the 4DOM suite developed by
470 Fourthought, Inc. In its original form, it was Copyright (c) 2000
471 Fourthought Inc, USA; All Rights Reserved.
472 """
473
474 - def __init__(self, stream=sys.stdout, encoding="UTF-8", indent=3):
475 """
476 Initialize a serializer.
477 @param stream: Stream to write output to.
478 @param encoding: Output encoding.
479 @param indent: Number of spaces to indent, as an integer
480 """
481 self.stream = stream
482 self.encoding = encoding
483 self._indent = indent * " "
484 self._depth = 0
485 self._inText = 0
486
488 """
489 Serialize the passed-in XML document.
490 @param xmlDom: XML DOM tree to serialize
491 @raise ValueError: If there's an unknown node type in the document.
492 """
493 self._visit(xmlDom)
494 self.stream.write("\n")
495
500
502 if not self._inText and self._indent:
503 self._write('\n' + self._indent*self._depth)
504 return
505
507 """
508 @raise ValueError: If there's an unknown node type in the document.
509 """
510 if node.nodeType == Node.ELEMENT_NODE:
511 return self._visitElement(node)
512
513 elif node.nodeType == Node.ATTRIBUTE_NODE:
514 return self._visitAttr(node)
515
516 elif node.nodeType == Node.TEXT_NODE:
517 return self._visitText(node)
518
519 elif node.nodeType == Node.CDATA_SECTION_NODE:
520 return self._visitCDATASection(node)
521
522 elif node.nodeType == Node.ENTITY_REFERENCE_NODE:
523 return self._visitEntityReference(node)
524
525 elif node.nodeType == Node.ENTITY_NODE:
526 return self._visitEntity(node)
527
528 elif node.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
529 return self._visitProcessingInstruction(node)
530
531 elif node.nodeType == Node.COMMENT_NODE:
532 return self._visitComment(node)
533
534 elif node.nodeType == Node.DOCUMENT_NODE:
535 return self._visitDocument(node)
536
537 elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
538 return self._visitDocumentType(node)
539
540 elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
541 return self._visitDocumentFragment(node)
542
543 elif node.nodeType == Node.NOTATION_NODE:
544 return self._visitNotation(node)
545
546
547 raise ValueError("Unknown node type: %s" % repr(node))
548
550 for curr in node:
551 curr is not exclude and self._visit(curr)
552 return
553
555 for item in node.values():
556 self._visit(item)
557 return
558
566
568 self._write("<?xml version='1.0' encoding='%s'?>" % (self.encoding or 'utf-8'))
569 self._inText = 0
570 return
571
577
581
583 self._tryIndent()
584 self._write('<%s' % node.tagName)
585 for attr in node.attributes.values():
586 self._visitAttr(attr)
587 if len(node.childNodes):
588 self._write('>')
589 self._depth = self._depth + 1
590 self._visitNodeList(node.childNodes)
591 self._depth = self._depth - 1
592 not (self._inText) and self._tryIndent()
593 self._write('</%s>' % node.tagName)
594 else:
595 self._write('/>')
596 self._inText = 0
597 return
598
599 - def _visitText(self, node):
600 text = node.data
601 if self._indent:
602 text.strip()
603 if text:
604 text = _translateCDATA(text, self.encoding)
605 self.stream.write(text)
606 self._inText = 1
607 return
608
610 if not doctype.systemId and not doctype.publicId: return
611 self._tryIndent()
612 self._write('<!DOCTYPE %s' % doctype.name)
613 if doctype.systemId and '"' in doctype.systemId:
614 system = "'%s'" % doctype.systemId
615 else:
616 system = '"%s"' % doctype.systemId
617 if doctype.publicId and '"' in doctype.publicId:
618
619
620
621 public = "'%s'" % doctype.publicId
622 else:
623 public = '"%s"' % doctype.publicId
624 if doctype.publicId and doctype.systemId:
625 self._write(' PUBLIC %s %s' % (public, system))
626 elif doctype.systemId:
627 self._write(' SYSTEM %s' % system)
628 if doctype.entities or doctype.notations:
629 self._write(' [')
630 self._depth = self._depth + 1
631 self._visitNamedNodeMap(doctype.entities)
632 self._visitNamedNodeMap(doctype.notations)
633 self._depth = self._depth - 1
634 self._tryIndent()
635 self._write(']>')
636 else:
637 self._write('>')
638 self._inText = 0
639 return
640
642 """Visited from a NamedNodeMap in DocumentType"""
643 self._tryIndent()
644 self._write('<!ENTITY %s' % (node.nodeName))
645 node.publicId and self._write(' PUBLIC %s' % node.publicId)
646 node.systemId and self._write(' SYSTEM %s' % node.systemId)
647 node.notationName and self._write(' NDATA %s' % node.notationName)
648 self._write('>')
649 return
650
652 """Visited from a NamedNodeMap in DocumentType"""
653 self._tryIndent()
654 self._write('<!NOTATION %s' % node.nodeName)
655 node.publicId and self._write(' PUBLIC %s' % node.publicId)
656 node.systemId and self._write(' SYSTEM %s' % node.systemId)
657 self._write('>')
658 return
659
661 self._tryIndent()
662 self._write('<![CDATA[%s]]>' % (node.data))
663 self._inText = 0
664 return
665
671
673 self._write('&%s;' % node.nodeName)
674 self._inText = 1
675 return
676
678 self._tryIndent()
679 self._write('<?%s %s?>' % (node.target, node.data))
680 self._inText = 0
681 return
682
683 -def _encodeText(text, encoding):
684 """
685 @copyright: This code, prior to customization, was part of the PyXML
686 codebase, and before that was part of the 4DOM suite developed by
687 Fourthought, Inc. It its original form, it was attributed to Martin v.
688 Löwis and was Copyright (c) 2000 Fourthought Inc, USA; All Rights Reserved.
689 """
690 encoder = codecs.lookup(encoding)[0]
691 if type(text) is not UnicodeType:
692 text = unicode(text, "utf-8")
693 return encoder(text)[0]
694
696 """
697 Handles normalization and some intelligence about quoting.
698
699 @copyright: This code, prior to customization, was part of the PyXML
700 codebase, and before that was part of the 4DOM suite developed by
701 Fourthought, Inc. In its original form, it was Copyright (c) 2000
702 Fourthought Inc, USA; All Rights Reserved.
703 """
704 if not characters:
705 return '', "'"
706 if "'" in characters:
707 delimiter = '"'
708 new_chars = re.sub('"', '"', characters)
709 else:
710 delimiter = "'"
711 new_chars = re.sub("'", ''', characters)
712
713
714
715 if "\n" in characters:
716 new_chars = re.sub('\n', ' ', new_chars)
717 return new_chars, delimiter
718
719
def _translateCDATA(characters, encoding='UTF-8', prev_chars='', markupSafe=0):
    """
    Escapes character data so it can be written as XML text content.

    Unless C{markupSafe} is set, every C{&}, C{<} and C{]]>} in the text is
    replaced by the corresponding entity form.  A leading C{>} is also
    escaped when the previously-written text ended in C{]]}, so that the two
    pieces cannot combine into a C{]]>} sequence.  Characters matched by the
    illegal-character pattern are then replaced by numeric character
    references, and the result is encoded via L{_encodeText}.

    @param characters: Text to escape.
    @param encoding: Output encoding passed on to L{_encodeText}.
    @param prev_chars: Text written immediately before C{characters}.
    @param markupSafe: If true, skip the markup escaping step.

    @return: Escaped and encoded text, or an empty string if there is no text.

    @copyright: This code, prior to customization, was part of the PyXML
    codebase, and before that was part of the 4DOM suite developed by
    Fourthought, Inc. In its original form, it was Copyright (c) 2000
    Fourthought Inc, USA; All Rights Reserved.
    """
    if not characters:
        return ''
    CDATA_CHAR_PATTERN = re.compile('[&<]|]]>')
    # Each special sequence must map to its escaped form; mapping a character
    # to itself would leave the markup unescaped.
    CHAR_TO_ENTITY = { '&': '&amp;', '<': '&lt;', ']]>': ']]&gt;', }
    ILLEGAL_LOW_CHARS = '[\x01-\x08\x0B-\x0C\x0E-\x1F]'
    ILLEGAL_HIGH_CHARS = '\xEF\xBF[\xBE\xBF]'
    XML_ILLEGAL_CHAR_PATTERN = re.compile('%s|%s'%(ILLEGAL_LOW_CHARS, ILLEGAL_HIGH_CHARS))
    if markupSafe:
        new_string = characters
    else:
        new_string = CDATA_CHAR_PATTERN.sub(lambda m: CHAR_TO_ENTITY[m.group()], characters)
        # A leading '>' is never touched by the pattern above, so it is still
        # the first character here; escape it if it would complete a ']]>'.
        if prev_chars[-2:] == ']]' and characters[0] == '>':
            new_string = '&gt;' + new_string[1:]
    # NOTE(review): ILLEGAL_HIGH_CHARS matches a three-character sequence and
    # ord() only accepts a single character, so a match on that alternative
    # would raise TypeError -- confirm the intended replacement before changing.
    new_string = XML_ILLEGAL_CHAR_PATTERN.sub(lambda m: '&#%i;' % ord(m.group()), new_string)
    return _encodeText(new_string, encoding)
750