# -*- coding: iso-8859-1 -*-
# vim: set ft=python ts=3 sw=3 expandtab:
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#
# C E D A R
# S O L U T I O N S "Software done right."
# S O F T W A R E
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#
# Copyright (c) 2004-2006,2010,2015 Kenneth J. Pronovici.
# All rights reserved.
#
# Portions Copyright (c) 2000 Fourthought Inc, USA.
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License,
# Version 2, as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Copies of the GNU General Public License are available from
# the Free Software Foundation website, http://www.gnu.org/.
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#
# Author : Kenneth J. Pronovici <pronovic@ieee.org>
# Language : Python 3 (>= 3.4)
# Project : Cedar Backup, release 3
# Purpose : Provides general XML-related functionality.
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
########################################################################
# Module documentation
########################################################################
"""
Provides general XML-related functionality.
What I'm trying to do here is abstract much of the functionality that directly
accesses the DOM tree. This is not so much to "protect" the other code from
the DOM, but to standardize the way it's used. It will also help extension
authors write code that easily looks more like the rest of Cedar Backup.
Module Attributes
=================
Attributes:
TRUE_BOOLEAN_VALUES: List of boolean values in XML representing ``True``
FALSE_BOOLEAN_VALUES: List of boolean values in XML representing ``False``
VALID_BOOLEAN_VALUES: List of valid boolean values in XML
:author: Kenneth J. Pronovici <pronovic@ieee.org>
"""
# pylint: disable=C0111,C0103,W0511,W0104,W0106
########################################################################
# Imported modules
########################################################################
# System modules
import sys
import re
import logging
from io import StringIO
# XML-related modules
from xml.parsers.expat import ExpatError
from xml.dom.minidom import Node
from xml.dom.minidom import getDOMImplementation
from xml.dom.minidom import parseString
########################################################################
# Module-wide constants and variables
########################################################################
# Module-level logger used by this module.
logger = logging.getLogger("CedarBackup3.log.xml")
# String values that readBoolean() interprets as True.
TRUE_BOOLEAN_VALUES = [ "Y", "y", ]
# String values that readBoolean() interprets as False.
FALSE_BOOLEAN_VALUES = [ "N", "n", ]
# Every string value accepted as a boolean (true values followed by false values).
VALID_BOOLEAN_VALUES = TRUE_BOOLEAN_VALUES + FALSE_BOOLEAN_VALUES
########################################################################
# Functions for creating and parsing DOM trees
########################################################################
def createOutputDom(name="cb_config"):
   """
   Creates a DOM tree used for writing an XML document.

   Args:
      name: Base name of the document (root node name)

   Returns:
      Tuple (xmlDom, parentNode) for the new document, where parentNode
      is the document's root element
   """
   # No namespace and no doctype: just a document with a single root element
   xmlDom = getDOMImplementation().createDocument(None, name, None)
   return (xmlDom, xmlDom.documentElement)
########################################################################
# Functions for reading values out of XML documents
########################################################################
def isElement(node):
   """
   Returns True or False depending on whether the XML node is an element node.

   Args:
      node: DOM node to check (must expose a ``nodeType`` attribute)

   Returns:
      ``True`` if the node's type is ``Node.ELEMENT_NODE``, ``False`` otherwise
   """
   return node.nodeType == Node.ELEMENT_NODE
def readChildren(parent, name):
   """
   Returns a list of nodes with a given name immediately beneath the
   parent.

   By "immediately beneath" the parent, we mean from among nodes that are
   direct children of the passed-in parent node.

   Underneath, we use the Python ``getElementsByTagName`` method, which
   returns all descendants with a given name below the parent, at any
   level.  We prune that list to include only the nodes whose
   ``parentNode`` is the passed-in parent.

   Args:
      parent: Parent node to search beneath (may be ``None``)
      name: Name of nodes to search for

   Returns:
      List of child nodes with correct parent, or an empty list if
      no matching nodes are found or the parent is ``None``.
   """
   if parent is None:
      return []
   # Identity comparison: only direct children of this exact node qualify
   return [entry for entry in parent.getElementsByTagName(name) if entry.parentNode is parent]
def readFirstChild(parent, name):
   """
   Returns the first child with a given name immediately beneath the parent.

   By "immediately beneath" the parent, we mean from among nodes that are
   direct children of the passed-in parent node.

   Args:
      parent: Parent node to search beneath
      name: Name of node to search for

   Returns:
      First properly-named child of parent, or ``None`` if no matching nodes are found
   """
   children = readChildren(parent, name)
   # An empty (or missing) result means there is no such child
   return children[0] if children else None
def readStringList(parent, name):
   """
   Returns a list of the string contents associated with nodes with a given
   name immediately beneath the parent.

   By "immediately beneath" the parent, we mean from among nodes that are
   direct children of the passed-in parent node.

   First, we find all of the nodes using :any:`readChildren`, and then we
   retrieve the "string contents" of each of those nodes.  We assume that
   string contents of a given node belong to the first ``TEXT_NODE`` child
   of that node.  Nodes which have no ``TEXT_NODE`` children are not
   represented in the returned list.

   Args:
      parent: Parent node to search beneath
      name: Name of node to search for

   Returns:
      List of strings as described above, or ``None`` if no matching node
      with a ``TEXT_NODE`` child is found
   """
   values = []
   for entry in readChildren(parent, name):
      for child in entry.childNodes:
         if child.nodeType == Node.TEXT_NODE:
            # Only the first text node of each matching element is used
            values.append(child.nodeValue)
            break
   # Callers rely on None (not an empty list) to signal "nothing found"
   return values if values else None
def readString(parent, name):
   """
   Returns string contents of the first child with a given name immediately
   beneath the parent.

   By "immediately beneath" the parent, we mean from among nodes that are
   direct children of the passed-in parent node.  We assume that string
   contents of a given node belong to the first ``TEXT_NODE`` child of that
   node.

   Note that the lookup goes through :any:`readStringList`, which omits
   matching nodes that have no ``TEXT_NODE`` child.  The value returned is
   therefore that of the first matching node which actually has text.

   Args:
      parent: Parent node to search beneath
      name: Name of node to search for

   Returns:
      String contents of node or ``None`` if no matching nodes are found
   """
   values = readStringList(parent, name)
   return None if values is None else values[0]
def readInteger(parent, name):
   """
   Returns integer contents of the first child with a given name immediately
   beneath the parent.

   By "immediately beneath" the parent, we mean from among nodes that are
   direct children of the passed-in parent node.

   Args:
      parent: Parent node to search beneath
      name: Name of node to search for

   Returns:
      Integer contents of node or ``None`` if no matching nodes are found

   Raises:
      ValueError: If the string at the location can't be converted to an integer
   """
   value = readString(parent, name)
   return None if value is None else int(value)
def readLong(parent, name):
   """
   Returns long integer contents of the first child with a given name immediately
   beneath the parent.

   By "immediately beneath" the parent, we mean from among nodes that are
   direct children of the passed-in parent node.

   Python 3 has a single integer type, so this behaves exactly like
   :any:`readInteger` and is kept as a separate entry point for callers
   that use this name.

   Args:
      parent: Parent node to search beneath
      name: Name of node to search for

   Returns:
      Long integer contents of node or ``None`` if no matching nodes are found

   Raises:
      ValueError: If the string at the location can't be converted to an integer
   """
   return readInteger(parent, name)
def readFloat(parent, name):
   """
   Returns float contents of the first child with a given name immediately
   beneath the parent.

   By "immediately beneath" the parent, we mean from among nodes that are
   direct children of the passed-in parent node.

   Args:
      parent: Parent node to search beneath
      name: Name of node to search for

   Returns:
      Float contents of node or ``None`` if no matching nodes are found

   Raises:
      ValueError: If the string at the location can't be converted to a
      float value.
   """
   value = readString(parent, name)
   return None if value is None else float(value)
def readBoolean(parent, name):
   """
   Returns boolean contents of the first child with a given name immediately
   beneath the parent.

   By "immediately beneath" the parent, we mean from among nodes that are
   direct children of the passed-in parent node.

   The string value of the node must be one of the values in :any:`VALID_BOOLEAN_VALUES`.

   Args:
      parent: Parent node to search beneath
      name: Name of node to search for

   Returns:
      Boolean contents of node or ``None`` if no matching nodes are found

   Raises:
      ValueError: If the string at the location can't be converted to a boolean
   """
   value = readString(parent, name)
   if value is None:
      return None
   if value in TRUE_BOOLEAN_VALUES:
      return True
   if value in FALSE_BOOLEAN_VALUES:
      return False
   raise ValueError("Boolean values must be one of %s." % VALID_BOOLEAN_VALUES)
########################################################################
# Functions for writing values into XML documents
########################################################################
def addContainerNode(xmlDom, parentNode, nodeName):
   """
   Adds a container node as the next child of a parent node.

   Args:
      xmlDom: DOM tree as from ``impl.createDocument()``
      parentNode: Parent node to create child for
      nodeName: Name of the new container node

   Returns:
      Reference to the newly-created node
   """
   containerNode = xmlDom.createElement(nodeName)
   parentNode.appendChild(containerNode)
   return containerNode
def addStringNode(xmlDom, parentNode, nodeName, nodeValue):
   """
   Adds a text node as the next child of a parent, to contain a string.

   If the ``nodeValue`` is None, then the node will be created, but will be
   empty (i.e. will contain no text node child).

   Args:
      xmlDom: DOM tree as from ``impl.createDocument()``
      parentNode: Parent node to create child for
      nodeName: Name of the new container node
      nodeValue: The value to put into the node

   Returns:
      Reference to the newly-created node
   """
   containerNode = addContainerNode(xmlDom, parentNode, nodeName)
   if nodeValue is not None:
      containerNode.appendChild(xmlDom.createTextNode(nodeValue))
   return containerNode
def addIntegerNode(xmlDom, parentNode, nodeName, nodeValue):
   """
   Adds a text node as the next child of a parent, to contain an integer.

   If the ``nodeValue`` is None, then the node will be created, but will be
   empty (i.e. will contain no text node child).

   The integer will be converted to a string using "%d".  The result will be
   added to the document via :any:`addStringNode`.

   Args:
      xmlDom: DOM tree as from ``impl.createDocument()``
      parentNode: Parent node to create child for
      nodeName: Name of the new container node
      nodeValue: The value to put into the node

   Returns:
      Reference to the newly-created node
   """
   text = None if nodeValue is None else "%d" % nodeValue
   return addStringNode(xmlDom, parentNode, nodeName, text)
def addLongNode(xmlDom, parentNode, nodeName, nodeValue):
   """
   Adds a text node as the next child of a parent, to contain a long integer.

   If the ``nodeValue`` is None, then the node will be created, but will be
   empty (i.e. will contain no text node child).

   The integer will be converted to a string using "%d".  Python 3 has a
   single integer type, so this behaves exactly like :any:`addIntegerNode`
   and is kept as a separate entry point for callers that use this name.

   Args:
      xmlDom: DOM tree as from ``impl.createDocument()``
      parentNode: Parent node to create child for
      nodeName: Name of the new container node
      nodeValue: The value to put into the node

   Returns:
      Reference to the newly-created node
   """
   return addIntegerNode(xmlDom, parentNode, nodeName, nodeValue)
def addBooleanNode(xmlDom, parentNode, nodeName, nodeValue):
   """
   Adds a text node as the next child of a parent, to contain a boolean.

   If the ``nodeValue`` is None, then the node will be created, but will be
   empty (i.e. will contain no text node child).

   Boolean ``True``, or anything else interpreted as ``True`` by Python, will
   be converted to a string "Y".  Anything else will be converted to a
   string "N".  The result is added to the document via :any:`addStringNode`.

   Args:
      xmlDom: DOM tree as from ``impl.createDocument()``
      parentNode: Parent node to create child for
      nodeName: Name of the new container node
      nodeValue: The value to put into the node

   Returns:
      Reference to the newly-created node
   """
   if nodeValue is None:
      # None is "no value", which is distinct from False
      return addStringNode(xmlDom, parentNode, nodeName, None)
   return addStringNode(xmlDom, parentNode, nodeName, "Y" if nodeValue else "N")
########################################################################
# Functions for serializing DOM trees
########################################################################
def serializeDom(xmlDom, indent=3):
   """
   Serializes a DOM tree and returns the result in a string.

   Args:
      xmlDom: XML DOM tree to serialize
      indent: Number of spaces to indent, as an integer

   Returns:
      String form of DOM tree, pretty-printed

   Raises:
      ValueError: If there's an unknown node type in the document
   """
   # The context manager closes the buffer even if serialization fails
   with StringIO() as xmlBuffer:
      Serializer(xmlBuffer, "UTF-8", indent=indent).serialize(xmlDom)
      return xmlBuffer.getvalue()
class Serializer(object):

   """
   XML serializer class.

   This is a customized serializer that I hacked together based on what I found
   in the PyXML distribution.  Basically, around release 2.7.0, the only reason
   I still had around a dependency on PyXML was for the PrettyPrint
   functionality, and that seemed pointless.  So, I stripped the PrettyPrint
   code out of PyXML and hacked bits of it off until it did just what I needed
   and no more.

   This code started out being called PrintVisitor, but I decided it makes more
   sense just calling it a serializer.  I've made nearly all of the methods
   private, and I've added a new high-level serialize() method rather than
   having clients call ``visit()``.

   Anyway, as a consequence of my hacking with it, this can't quite be called a
   complete XML serializer any more.  I ripped out support for HTML and XHTML,
   and there is also no longer any support for namespaces (which I took out
   because this dragged along a lot of extra code, and Cedar Backup doesn't use
   namespaces).  However, everything else should pretty much work as expected.

   @copyright: This code, prior to customization, was part of the PyXML
   codebase, and before that was part of the 4DOM suite developed by
   Fourthought, Inc.  In its original form, it was Copyright (c) 2000
   Fourthought Inc, USA; All Rights Reserved.
   """

   # Maps each supported DOM node type to the name of the method that writes it
   _VISITORS = {
      Node.ELEMENT_NODE: "_visitElement",
      Node.ATTRIBUTE_NODE: "_visitAttr",
      Node.TEXT_NODE: "_visitText",
      Node.CDATA_SECTION_NODE: "_visitCDATASection",
      Node.ENTITY_REFERENCE_NODE: "_visitEntityReference",
      Node.ENTITY_NODE: "_visitEntity",
      Node.PROCESSING_INSTRUCTION_NODE: "_visitProcessingInstruction",
      Node.COMMENT_NODE: "_visitComment",
      Node.DOCUMENT_NODE: "_visitDocument",
      Node.DOCUMENT_TYPE_NODE: "_visitDocumentType",
      Node.DOCUMENT_FRAGMENT_NODE: "_visitDocumentFragment",
      Node.NOTATION_NODE: "_visitNotation",
   }

   def __init__(self, stream=sys.stdout, encoding="UTF-8", indent=3):
      """
      Initialize a serializer.

      Args:
         stream: Stream to write output to
         encoding: Output encoding
         indent: Number of spaces to indent, as an integer
      """
      self.stream = stream
      self.encoding = encoding
      self._indent = indent * " "   # one level of indentation; empty string disables pretty-printing
      self._depth = 0               # current element nesting depth
      self._inText = 0              # non-zero right after text output, which suppresses indentation

   def serialize(self, xmlDom):
      """
      Serialize the passed-in XML document.

      Args:
         xmlDom: XML DOM tree to serialize

      Raises:
         ValueError: If there's an unknown node type in the document
      """
      self._visit(xmlDom)
      self.stream.write("\n")

   def _write(self, text):
      """Writes text to the stream after passing it through ``_encodeText``."""
      self.stream.write(_encodeText(text, self.encoding))

   def _tryIndent(self):
      """Starts a new, indented line unless we are in the middle of text or indenting is off."""
      if not self._inText and self._indent:
         self._write('\n' + self._indent*self._depth)

   def _visit(self, node):
      """
      Dispatches a node to the method that handles its node type.

      Raises:
         ValueError: If there's an unknown node type in the document
      """
      methodName = self._VISITORS.get(node.nodeType)
      if methodName is None:
         # It has a node type, but we don't know how to handle it
         raise ValueError("Unknown node type: %s" % repr(node))
      return getattr(self, methodName)(node)

   def _visitNodeList(self, node, exclude=None):
      """Visits every node in a node list, skipping the ``exclude`` node if given."""
      for curr in node:
         if curr is not exclude:
            self._visit(curr)

   def _visitNamedNodeMap(self, node):
      """Visits every node held in a named node map."""
      values = getattr(node, "values", None)
      if values is not None:
         items = list(values())
      else:
         # NOTE(review): some DOM map implementations appear to expose only
         # item()/len() rather than values(); fall back to positional access.
         items = [node.item(index) for index in range(len(node))]
      for item in items:
         self._visit(item)

   def _visitAttr(self, node):
      """Writes an attribute as `` name=<quote>value<quote>`` with the value escaped."""
      self._write(' ' + node.name)
      text = _translateCDATA(node.value, self.encoding)
      text, delimiter = _translateCDATAAttr(text)
      self.stream.write("=%s%s%s" % (delimiter, text, delimiter))

   def _visitProlog(self):
      """Writes the XML declaration."""
      self._write("<?xml version='1.0' encoding='%s'?>" % (self.encoding or 'utf-8'))
      self._inText = 0

   def _visitDocument(self, node):
      """Writes the prolog, the doctype (if any), and then the rest of the document."""
      self._visitProlog()
      if node.doctype:
         self._visitDocumentType(node.doctype)
      # The doctype was already written above, so don't visit it twice
      self._visitNodeList(node.childNodes, exclude=node.doctype)

   def _visitDocumentFragment(self, node):
      """Writes the children of a document fragment."""
      self._visitNodeList(node.childNodes)

   def _visitElement(self, node):
      """Writes an element, its attributes and (recursively) its children."""
      self._tryIndent()
      self._write('<%s' % node.tagName)
      for attr in list(node.attributes.values()):
         self._visitAttr(attr)
      if len(node.childNodes):
         self._write('>')
         self._depth = self._depth + 1
         self._visitNodeList(node.childNodes)
         self._depth = self._depth - 1
         # Put the closing tag on its own line unless it directly follows text
         if not self._inText:
            self._tryIndent()
         self._write('</%s>' % node.tagName)
      else:
         self._write('/>')
      self._inText = 0

   def _visitText(self, node):
      """
      Writes a text node, escaping markup characters.

      When indentation is enabled, text consisting only of whitespace is
      skipped.  Such nodes are typically formatting left over from parsing
      an already-indented document, and writing them would both duplicate
      that whitespace and switch off indentation for the enclosing element.
      Text with any other content is written unchanged.
      """
      text = node.data
      if self._indent and not text.strip():
         text = ""
      if text:
         text = _translateCDATA(text, self.encoding)
         self.stream.write(text)
         self._inText = 1

   def _visitDocumentType(self, doctype):
      """Writes a DOCTYPE declaration; nothing is written if it has no system or public id."""
      if not doctype.systemId and not doctype.publicId:
         return
      self._tryIndent()
      self._write('<!DOCTYPE %s' % doctype.name)
      # Choose a quote character that does not occur in the identifier
      if doctype.systemId and '"' in doctype.systemId:
         system = "'%s'" % doctype.systemId
      else:
         system = '"%s"' % doctype.systemId
      if doctype.publicId and '"' in doctype.publicId:
         # We should probably throw an error
         # Valid characters:  <space> | <newline> | <linefeed> |
         #                    [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
         public = "'%s'" % doctype.publicId
      else:
         public = '"%s"' % doctype.publicId
      if doctype.publicId and doctype.systemId:
         self._write(' PUBLIC %s %s' % (public, system))
      elif doctype.systemId:
         self._write(' SYSTEM %s' % system)
      if doctype.entities or doctype.notations:
         self._write(' [')
         self._depth = self._depth + 1
         self._visitNamedNodeMap(doctype.entities)
         self._visitNamedNodeMap(doctype.notations)
         self._depth = self._depth - 1
         self._tryIndent()
         self._write(']>')
      else:
         self._write('>')
      self._inText = 0

   def _visitEntity(self, node):
      """Visited from a NamedNodeMap in DocumentType"""
      self._tryIndent()
      self._write('<!ENTITY %s' % (node.nodeName))
      if node.publicId:
         self._write(' PUBLIC %s' % node.publicId)
      if node.systemId:
         self._write(' SYSTEM %s' % node.systemId)
      if node.notationName:
         self._write(' NDATA %s' % node.notationName)
      self._write('>')

   def _visitNotation(self, node):
      """Visited from a NamedNodeMap in DocumentType"""
      self._tryIndent()
      self._write('<!NOTATION %s' % node.nodeName)
      if node.publicId:
         self._write(' PUBLIC %s' % node.publicId)
      if node.systemId:
         self._write(' SYSTEM %s' % node.systemId)
      self._write('>')

   def _visitCDATASection(self, node):
      """Writes a CDATA section; the data is written without escaping."""
      self._tryIndent()
      self._write('<![CDATA[%s]]>' % (node.data))
      self._inText = 0

   def _visitComment(self, node):
      """Writes a comment."""
      self._tryIndent()
      self._write('<!--%s-->' % (node.data))
      self._inText = 0

   def _visitEntityReference(self, node):
      """Writes an entity reference; it is treated as text for indentation purposes."""
      self._write('&%s;' % node.nodeName)
      self._inText = 1

   def _visitProcessingInstruction(self, node):
      """Writes a processing instruction."""
      self._tryIndent()
      self._write('<?%s %s?>' % (node.target, node.data))
      self._inText = 0
# pylint: disable=W0613
def _encodeText(text, encoding):
"""Safely encodes the passed-in text as a Unicode string, converting bytes to UTF-8 if necessary."""
if text is None:
return text
try:
if isinstance(text, bytes):
text = str(text, "utf-8")
return text
except UnicodeError:
raise ValueError("Path could not be safely encoded as utf-8.")
def _translateCDATAAttr(characters):
"""
Handles normalization and some intelligence about quoting.
@copyright: This code, prior to customization, was part of the PyXML
codebase, and before that was part of the 4DOM suite developed by
Fourthought, Inc. It its original form, it was Copyright (c) 2000
Fourthought Inc, USA; All Rights Reserved.
"""
if not characters:
return '', "'"
if "'" in characters:
delimiter = '"'
new_chars = re.sub('"', '"', characters)
else:
delimiter = "'"
new_chars = re.sub("'", ''', characters)
#FIXME: There's more to normalization
#Convert attribute new-lines to character entity
# characters is possibly shorter than new_chars (no entities)
if "\n" in characters:
new_chars = re.sub('\n', ' ', new_chars)
return new_chars, delimiter
#Note: Unicode object only for now
def _translateCDATA(characters, encoding='UTF-8', prev_chars='', markupSafe=0):
"""
@copyright: This code, prior to customization, was part of the PyXML
codebase, and before that was part of the 4DOM suite developed by
Fourthought, Inc. It its original form, it was Copyright (c) 2000
Fourthought Inc, USA; All Rights Reserved.
"""
CDATA_CHAR_PATTERN = re.compile('[&<]|]]>')
CHAR_TO_ENTITY = { '&': '&', '<': '<', ']]>': ']]>', }
ILLEGAL_LOW_CHARS = '[\x01-\x08\x0B-\x0C\x0E-\x1F]'
ILLEGAL_HIGH_CHARS = '\xEF\xBF[\xBE\xBF]'
XML_ILLEGAL_CHAR_PATTERN = re.compile('%s|%s'%(ILLEGAL_LOW_CHARS, ILLEGAL_HIGH_CHARS))
if not characters:
return ''
if not markupSafe:
if CDATA_CHAR_PATTERN.search(characters):
new_string = CDATA_CHAR_PATTERN.subn(lambda m, d=CHAR_TO_ENTITY: d[m.group()], characters)[0]
else:
new_string = characters
if prev_chars[-2:] == ']]' and characters[0] == '>':
new_string = '>' + new_string[1:]
else:
new_string = characters
#Note: use decimal char entity rep because some browsers are broken
#FIXME: This will bomb for high characters. Should, for instance, detect
#The UTF-8 for 0xFFFE and put out 
if XML_ILLEGAL_CHAR_PATTERN.search(new_string):
new_string = XML_ILLEGAL_CHAR_PATTERN.subn(lambda m: '&#%i;' % ord(m.group()), new_string)[0]
new_string = _encodeText(new_string, encoding)
return new_string