src/ElementTreePython2.py

   1 #
   2 # ElementTree
   3 # $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
   4 #
   5 # light-weight XML support for Python 1.5.2 and later.
   6 #
   7 # history:
   8 # 2001-10-20 fl   created (from various sources)
   9 # 2001-11-01 fl   return root from parse method
  10 # 2002-02-16 fl   sort attributes in lexical order
  11 # 2002-04-06 fl   TreeBuilder refactoring, added PythonDoc markup
  12 # 2002-05-01 fl   finished TreeBuilder refactoring
  13 # 2002-07-14 fl   added basic namespace support to ElementTree.write
  14 # 2002-07-25 fl   added QName attribute support
  15 # 2002-10-20 fl   fixed encoding in write
  16 # 2002-11-24 fl   changed default encoding to ascii; fixed attribute encoding
  17 # 2002-11-27 fl   accept file objects or file names for parse/write
  18 # 2002-12-04 fl   moved XMLTreeBuilder back to this module
  19 # 2003-01-11 fl   fixed entity encoding glitch for us-ascii
  20 # 2003-02-13 fl   added XML literal factory
  21 # 2003-02-21 fl   added ProcessingInstruction/PI factory
  22 # 2003-05-11 fl   added tostring/fromstring helpers
  23 # 2003-05-26 fl   added ElementPath support
  24 # 2003-07-05 fl   added makeelement factory method
  25 # 2003-07-28 fl   added more well-known namespace prefixes
  26 # 2003-08-15 fl   fixed typo in ElementTree.findtext (Thomas Dartsch)
  27 # 2003-09-04 fl   fall back on emulator if ElementPath is not installed
  28 # 2003-10-31 fl   markup updates
  29 # 2003-11-15 fl   fixed nested namespace bug
  30 # 2004-03-28 fl   added XMLID helper
  31 # 2004-06-02 fl   added default support to findtext
  32 # 2004-06-08 fl   fixed encoding of non-ascii element/attribute names
  33 # 2004-08-23 fl   take advantage of post-2.1 expat features
  34 # 2005-02-01 fl   added iterparse implementation
  35 # 2005-03-02 fl   fixed iterparse support for pre-2.2 versions
  36 #
  37 # Copyright (c) 1999-2005 by Fredrik Lundh.  All rights reserved.
  38 #
  39 # fredrik@pythonware.com
  40 # http://www.pythonware.com
  41 #
  42 # --------------------------------------------------------------------
  43 # The ElementTree toolkit is
  44 #
  45 # Copyright (c) 1999-2005 by Fredrik Lundh
  46 #
  47 # By obtaining, using, and/or copying this software and/or its
  48 # associated documentation, you agree that you have read, understood,
  49 # and will comply with the following terms and conditions:
  50 #
  51 # Permission to use, copy, modify, and distribute this software and
  52 # its associated documentation for any purpose and without fee is
  53 # hereby granted, provided that the above copyright notice appears in
  54 # all copies, and that both that copyright notice and this permission
  55 # notice appear in supporting documentation, and that the name of
  56 # Secret Labs AB or the author not be used in advertising or publicity
  57 # pertaining to distribution of the software without specific, written
  58 # prior permission.
  59 #
  60 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
  61 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
  62 # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
  63 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
  64 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  65 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  66 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  67 # OF THIS SOFTWARE.
  68 # --------------------------------------------------------------------
  69
# Names exported by "from <module> import *".  Helpers whose names are
# not listed here (mostly the underscore-prefixed ones) are internal.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring",
    "iselement", "iterparse",
    "parse",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "TreeBuilder",
    "VERSION", "XML",
    "XMLTreeBuilder",
    ]
  86
  87 ##
  88 # The <b>Element</b> type is a flexible container object, designed to
  89 # store hierarchical data structures in memory. The type can be
  90 # described as a cross between a list and a dictionary.
  91 # <p>
  92 # Each element has a number of properties associated with it:
  93 # <ul>
  94 # <li>a <i>tag</i>. This is a string identifying what kind of data
  95 # this element represents (the element type, in other words).</li>
  96 # <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
  97 # <li>a <i>text</i> string.</li>
  98 # <li>an optional <i>tail</i> string.</li>
  99 # <li>a number of <i>child elements</i>, stored in a Python sequence</li>
 100 # </ul>
 101 #
 102 # To create an element instance, use the {@link #Element} or {@link
 103 # #SubElement} factory functions.
 104 # <p>
 105 # The {@link #ElementTree} class can be used to wrap an element
 106 # structure, and convert it from and to XML.
 107 ##
 108
 109 import string, sys, re, platform
 110
 111 class _SimpleElementPath:
 112     # emulate pre-1.2 find/findtext/findall behaviour
 113     def find(self, element, tag):
 114         for elem in element:
 115             if elem.tag == tag:
 116                 return elem
 117         return None
 118     def findtext(self, element, tag, default=None):
 119         for elem in element:
 120             if elem.tag == tag:
 121                 return elem.text or ""
 122         return default
 123     def findall(self, element, tag):
 124         if tag[:3] == ".//":
 125             return element.getiterator(tag[3:])
 126         result = []
 127         for elem in element:
 128             if elem.tag == tag:
 129                 result.append(elem)
 130         return result
 131
# The string literal below is a bare expression statement: it only keeps
# the former optional import of an external ElementPath module around as
# disabled code and has no effect at run time.
"""
# obsolete
# ElementPath.py is for python3 2019
# file inexisting in sat before 2019
try:
    import ElementPath
except ImportError:
    # FIXME: issue warning in this case?
    ElementPath = _SimpleElementPath()
"""
# Path engine used by the find/findtext/findall methods of the element
# class: always the simple emulation defined above.
ElementPath = _SimpleElementPath() # before 2019 python2 situation sat5.0

# TODO: add support for custom namespace resolvers/default namespaces
# TODO: add improved support for incremental parsing

# Version string of this module (exported through __all__).
VERSION = "1.2.6"
 148
 149 ##
 150 # Internal element class.  This class defines the Element interface,
 151 # and provides a reference implementation of this interface.
 152 # <p>
 153 # You should not create instances of this class directly.  Use the
 154 # appropriate factory functions instead, such as {@link #Element}
 155 # and {@link #SubElement}.
 156 #
 157 # @see Element
 158 # @see SubElement
 159 # @see Comment
 160 # @see ProcessingInstruction
 161
 162 class _ElementInterface:
 163     # <tag attrib>text<child/>...</tag>tail
 164
 165     ##
 166     # (Attribute) Element tag.
 167
 168     tag = None
 169
 170     ##
 171     # (Attribute) Element attribute dictionary.  Where possible, use
 172     # {@link #_ElementInterface.get},
 173     # {@link #_ElementInterface.set},
 174     # {@link #_ElementInterface.keys}, and
 175     # {@link #_ElementInterface.items} to access
 176     # element attributes.
 177
 178     attrib = None
 179
 180     ##
 181     # (Attribute) Text before first subelement.  This is either a
 182     # string or the value None, if there was no text.
 183
 184     text = None
 185
 186     ##
 187     # (Attribute) Text after this element's end tag, but before the
 188     # next sibling element's start tag.  This is either a string or
 189     # the value None, if there was no text.
 190
 191     tail = None # text after end tag, if any
 192
 193     def __init__(self, tag, attrib):
 194         self.tag = tag
 195         self.attrib = attrib
 196         self._children = []
 197
 198     def __repr__(self):
 199         return "<Element %s at %x>" % (self.tag, id(self))
 200
 201     ##
 202     # Creates a new element object of the same type as this element.
 203     #
 204     # @param tag Element tag.
 205     # @param attrib Element attributes, given as a dictionary.
 206     # @return A new element instance.
 207
 208     def makeelement(self, tag, attrib):
 209         return Element(tag, attrib)
 210
 211     ##
 212     # Returns the number of subelements.
 213     #
 214     # @return The number of subelements.
 215
 216     def __len__(self):
 217         return len(self._children)
 218
 219     ##
 220     # Returns the given subelement.
 221     #
 222     # @param index What subelement to return.
 223     # @return The given subelement.
 224     # @exception IndexError If the given element does not exist.
 225
 226     def __getitem__(self, index):
 227         return self._children[index]
 228
 229     ##
 230     # Replaces the given subelement.
 231     #
 232     # @param index What subelement to replace.
 233     # @param element The new element value.
 234     # @exception IndexError If the given element does not exist.
 235     # @exception AssertionError If element is not a valid object.
 236
 237     def __setitem__(self, index, element):
 238         assert iselement(element)
 239         self._children[index] = element
 240
 241     ##
 242     # Deletes the given subelement.
 243     #
 244     # @param index What subelement to delete.
 245     # @exception IndexError If the given element does not exist.
 246
 247     def __delitem__(self, index):
 248         del self._children[index]
 249
 250     ##
 251     # Returns a list containing subelements in the given range.
 252     #
 253     # @param start The first subelement to return.
 254     # @param stop The first subelement that shouldn't be returned.
 255     # @return A sequence object containing subelements.
 256
 257     def __getslice__(self, start, stop):
 258         return self._children[start:stop]
 259
 260     ##
 261     # Replaces a number of subelements with elements from a sequence.
 262     #
 263     # @param start The first subelement to replace.
 264     # @param stop The first subelement that shouldn't be replaced.
 265     # @param elements A sequence object with zero or more elements.
 266     # @exception AssertionError If a sequence member is not a valid object.
 267
 268     def __setslice__(self, start, stop, elements):
 269         for element in elements:
 270             assert iselement(element)
 271         self._children[start:stop] = list(elements)
 272
 273     ##
 274     # Deletes a number of subelements.
 275     #
 276     # @param start The first subelement to delete.
 277     # @param stop The first subelement to leave in there.
 278
 279     def __delslice__(self, start, stop):
 280         del self._children[start:stop]
 281
 282     ##
 283     # Adds a subelement to the end of this element.
 284     #
 285     # @param element The element to add.
 286     # @exception AssertionError If a sequence member is not a valid object.
 287
 288     def append(self, element):
 289         assert iselement(element)
 290         self._children.append(element)
 291
 292     ##
 293     # Inserts a subelement at the given position in this element.
 294     #
 295     # @param index Where to insert the new subelement.
 296     # @exception AssertionError If the element is not a valid object.
 297
 298     def insert(self, index, element):
 299         assert iselement(element)
 300         self._children.insert(index, element)
 301
 302     ##
 303     # Removes a matching subelement.  Unlike the <b>find</b> methods,
 304     # this method compares elements based on identity, not on tag
 305     # value or contents.
 306     #
 307     # @param element What element to remove.
 308     # @exception ValueError If a matching element could not be found.
 309     # @exception AssertionError If the element is not a valid object.
 310
 311     def remove(self, element):
 312         assert iselement(element)
 313         self._children.remove(element)
 314
 315     ##
 316     # Returns all subelements.  The elements are returned in document
 317     # order.
 318     #
 319     # @return A list of subelements.
 320     # @defreturn list of Element instances
 321
 322     def getchildren(self):
 323         return self._children
 324
 325     ##
 326     # Finds the first matching subelement, by tag name or path.
 327     #
 328     # @param path What element to look for.
 329     # @return The first matching element, or None if no element was found.
 330     # @defreturn Element or None
 331
 332     def find(self, path):
 333         if ElementPath.find(self, path) == None:
 334             return ElementPath.find(self, path.encode())
 335         return ElementPath.find(self, path)
 336
 337     ##
 338     # Finds text for the first matching subelement, by tag name or path.
 339     #
 340     # @param path What element to look for.
 341     # @param default What to return if the element was not found.
 342     # @return The text content of the first matching element, or the
 343     #     default value no element was found.  Note that if the element
 344     #     has is found, but has no text content, this method returns an
 345     #     empty string.
 346     # @defreturn string
 347
 348     def findtext(self, path, default=None):
 349         return ElementPath.findtext(self, path, default)
 350
 351     ##
 352     # Finds all matching subelements, by tag name or path.
 353     #
 354     # @param path What element to look for.
 355     # @return A list or iterator containing all matching elements,
 356     #    in document order.
 357     # @defreturn list of Element instances
 358
 359     def findall(self, path):
 360         return ElementPath.findall(self, path)
 361
 362     ##
 363     # Resets an element.  This function removes all subelements, clears
 364     # all attributes, and sets the text and tail attributes to None.
 365
 366     def clear(self):
 367         self.attrib.clear()
 368         self._children = []
 369         self.text = self.tail = None
 370
 371     ##
 372     # Gets an element attribute.
 373     #
 374     # @param key What attribute to look for.
 375     # @param default What to return if the attribute was not found.
 376     # @return The attribute value, or the default value, if the
 377     #     attribute was not found.
 378     # @defreturn string or None
 379
 380     def get(self, key, default=None):
 381         res = self.attrib.get(key, default)
 382         if not res:
 383             res = self.attrib.get(key.encode(), default)
 384         if isinstance(res, bytes):
 385             return res.decode()
 386         else:
 387             return res
 388
 389     ##
 390     # Sets an element attribute.
 391     #
 392     # @param key What attribute to set.
 393     # @param value The attribute value.
 394
 395     def set(self, key, value):
 396         self.attrib[key] = value
 397
 398     ##
 399     # Gets a list of attribute names.  The names are returned in an
 400     # arbitrary order (just like for an ordinary Python dictionary).
 401     #
 402     # @return A list of element attribute names.
 403     # @defreturn list of strings
 404
 405     def keys(self):
 406         res = []
 407         for key in self.attrib.keys():
 408             if isinstance(key, bytes):
 409                 res.append(key.decode())
 410             else:
 411                 res.append(key)
 412         return res
 413
 414     ##
 415     # Gets element attributes, as a sequence.  The attributes are
 416     # returned in an arbitrary order.
 417     #
 418     # @return A list of (name, value) tuples for all attributes.
 419     # @defreturn list of (string, string) tuples
 420
 421     def items(self):
 422         return self.attrib.items()
 423
 424     ##
 425     # Creates a tree iterator.  The iterator loops over this element
 426     # and all subelements, in document order, and returns all elements
 427     # with a matching tag.
 428     # <p>
 429     # If the tree structure is modified during iteration, the result
 430     # is undefined.
 431     #
 432     # @param tag What tags to look for (default is to return all elements).
 433     # @return A list or iterator containing all the matching elements.
 434     # @defreturn list or iterator
 435
 436     def getiterator(self, tag=None):
 437         nodes = []
 438         if tag == "*":
 439             tag = None
 440         if tag is None or self.tag == tag:
 441             nodes.append(self)
 442         for node in self._children:
 443             nodes.extend(node.getiterator(tag))
 444         return nodes
 445
 446 # compatibility
 447 _Element = _ElementInterface
 448
 449 ##
 450 # Element factory.  This function returns an object implementing the
 451 # standard Element interface.  The exact class or type of that object
 452 # is implementation dependent, but it will always be compatible with
 453 # the {@link #_ElementInterface} class in this module.
 454 # <p>
 455 # The element name, attribute names, and attribute values can be
 456 # either 8-bit ASCII strings or Unicode strings.
 457 #
 458 # @param tag The element name.
 459 # @param attrib An optional dictionary, containing element attributes.
 460 # @param **extra Additional attributes, given as keyword arguments.
 461 # @return An element instance.
 462 # @defreturn Element
 463
 464 def Element(tag, attrib={}, **extra):
 465     attrib = attrib.copy()
 466     attrib.update(extra)
 467     return _ElementInterface(tag, attrib)
 468
 469 ##
 470 # Subelement factory.  This function creates an element instance, and
 471 # appends it to an existing element.
 472 # <p>
 473 # The element name, attribute names, and attribute values can be
 474 # either 8-bit ASCII strings or Unicode strings.
 475 #
 476 # @param parent The parent element.
 477 # @param tag The subelement name.
 478 # @param attrib An optional dictionary, containing element attributes.
 479 # @param **extra Additional attributes, given as keyword arguments.
 480 # @return An element instance.
 481 # @defreturn Element
 482
 483 def SubElement(parent, tag, attrib={}, **extra):
 484     attrib = attrib.copy()
 485     attrib.update(extra)
 486     element = parent.makeelement(tag, attrib)
 487     parent.append(element)
 488     return element
 489
 490 ##
 491 # Comment element factory.  This factory function creates a special
 492 # element that will be serialized as an XML comment.
 493 # <p>
 494 # The comment string can be either an 8-bit ASCII string or a Unicode
 495 # string.
 496 #
 497 # @param text A string containing the comment string.
 498 # @return An element instance, representing a comment.
 499 # @defreturn Element
 500
 501 def Comment(text=None):
 502     element = Element(Comment)
 503     element.text = text
 504     return element
 505
 506 ##
 507 # PI element factory.  This factory function creates a special element
 508 # that will be serialized as an XML processing instruction.
 509 #
 510 # @param target A string containing the PI target.
 511 # @param text A string containing the PI contents, if any.
 512 # @return An element instance, representing a PI.
 513 # @defreturn Element
 514
 515 def ProcessingInstruction(target, text=None):
 516     element = Element(ProcessingInstruction)
 517     element.text = target
 518     if text:
 519         element.text = element.text + " " + text
 520     return element
 521
 522 PI = ProcessingInstruction
 523
 524 ##
 525 # QName wrapper.  This can be used to wrap a QName attribute value, in
 526 # order to get proper namespace handling on output.
 527 #
 528 # @param text A string containing the QName value, in the form {uri}local,
 529 #     or, if the tag argument is given, the URI part of a QName.
 530 # @param tag Optional tag.  If given, the first argument is interpreted as
 531 #     an URI, and this argument is interpreted as a local name.
 532 # @return An opaque object, representing the QName.
 533
 534 class QName:
 535     def __init__(self, text_or_uri, tag=None):
 536         if tag:
 537             text_or_uri = "{%s}%s" % (text_or_uri, tag)
 538         self.text = text_or_uri
 539     def __str__(self):
 540         return self.text
 541     def __hash__(self):
 542         return hash(self.text)
 543     def __cmp__(self, other):
 544         if isinstance(other, QName):
 545             return cmp(self.text, other.text)
 546         return cmp(self.text, other)
 547
 548 ##
 549 # ElementTree wrapper class.  This class represents an entire element
 550 # hierarchy, and adds some extra support for serialization to and from
 551 # standard XML.
 552 #
 553 # @param element Optional root element.
 554 # @keyparam file Optional file handle or name.  If given, the
 555 #     tree is initialized with the contents of this XML file.
 556
 557 class ElementTree:
 558
 559     def __init__(self, element=None, file=None):
 560         assert element is None or iselement(element)
 561         self._root = element # first node
 562         if file:
 563             self.parse(file)
 564
 565     ##
 566     # Gets the root element for this tree.
 567     #
 568     # @return An element instance.
 569     # @defreturn Element
 570
 571     def getroot(self):
 572         return self._root
 573
 574     ##
 575     # Replaces the root element for this tree.  This discards the
 576     # current contents of the tree, and replaces it with the given
 577     # element.  Use with care.
 578     #
 579     # @param element An element instance.
 580
 581     def _setroot(self, element):
 582         assert iselement(element)
 583         self._root = element
 584
 585     ##
 586     # Loads an external XML document into this element tree.
 587     #
 588     # @param source A file name or file object.
 589     # @param parser An optional parser instance.  If not given, the
 590     #     standard {@link XMLTreeBuilder} parser is used.
 591     # @return The document root element.
 592     # @defreturn Element
 593
 594     def parse(self, source, parser=None):
 595         if not hasattr(source, "read"):
 596             source = open(source, "rb")
 597         if not parser:
 598             parser = XMLTreeBuilder()
 599         while 1:
 600             data = source.read(32768)
 601             if not data:
 602                 break
 603             parser.feed(data)
 604         self._root = parser.close()
 605         return self._root
 606
 607     ##
 608     # Creates a tree iterator for the root element.  The iterator loops
 609     # over all elements in this tree, in document order.
 610     #
 611     # @param tag What tags to look for (default is to return all elements)
 612     # @return An iterator.
 613     # @defreturn iterator
 614
 615     def getiterator(self, tag=None):
 616         assert self._root is not None
 617         return self._root.getiterator(tag)
 618
 619     ##
 620     # Finds the first toplevel element with given tag.
 621     # Same as getroot().find(path).
 622     #
 623     # @param path What element to look for.
 624     # @return The first matching element, or None if no element was found.
 625     # @defreturn Element or None
 626
 627     def find(self, path):
 628         assert self._root is not None
 629         if path[:1] == "/":
 630             path = "." + path
 631         return self._root.find(path)
 632
 633     ##
 634     # Finds the element text for the first toplevel element with given
 635     # tag.  Same as getroot().findtext(path).
 636     #
 637     # @param path What toplevel element to look for.
 638     # @param default What to return if the element was not found.
 639     # @return The text content of the first matching element, or the
 640     #     default value no element was found.  Note that if the element
 641     #     has is found, but has no text content, this method returns an
 642     #     empty string.
 643     # @defreturn string
 644
 645     def findtext(self, path, default=None):
 646         assert self._root is not None
 647         if path[:1] == "/":
 648             path = "." + path
 649         return self._root.findtext(path, default)
 650
 651     ##
 652     # Finds all toplevel elements with the given tag.
 653     # Same as getroot().findall(path).
 654     #
 655     # @param path What element to look for.
 656     # @return A list or iterator containing all matching elements,
 657     #    in document order.
 658     # @defreturn list of Element instances
 659
 660     def findall(self, path):
 661         assert self._root is not None
 662         if path[:1] == "/":
 663             path = "." + path
 664         return self._root.findall(path)
 665
 666     ##
 667     # Writes the element tree to a file, as XML.
 668     #
 669     # @param file A file name, or a file object opened for writing.
 670     # @param encoding Optional output encoding (default is US-ASCII).
 671
 672     def write(self, file, encoding="us-ascii"):
 673         assert self._root is not None
 674         if not hasattr(file, "write"):
 675             file = open(file, "wb")
 676         if not encoding:
 677             encoding = "us-ascii"
 678         elif encoding != "utf-8" and encoding != "us-ascii":
 679             file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
 680         self._write(file, self._root, encoding, {})
 681
 682     def _write(self, file, node, encoding, namespaces, margin=0):
 683         # write XML to file
 684         tag = node.tag
 685         if tag is Comment:
 686             file.write("<!-- %s -->\n" % _escape_cdata(node.text, encoding))
 687         elif tag is ProcessingInstruction:
 688             file.write("<?%s?>\n" % _escape_cdata(node.text, encoding))
 689         else:
 690             items = node.items()
 691             xmlns_items = [] # new namespaces in this scope
 692             try:
 693                 if isinstance(tag, QName) or tag[:1] == "{":
 694                     tag, xmlns = fixtag(tag, namespaces)
 695                     if xmlns: xmlns_items.append(xmlns)
 696             except TypeError:
 697                 _raise_serialization_error(tag)
 698             file.write(' ' * margin)
 699             file.write(_encode("<", encoding) + _encode(tag, encoding))
 700             if items or xmlns_items:
 701                 try:
 702                     items = sorted(items) # lexical order
 703                 except:
 704                     print("*** problem sorting items", items)
 705                 for k, v in items:
 706                     try:
 707                         if isinstance(k, QName) or k[:1] == "{":
 708                             k, xmlns = fixtag(k, namespaces)
 709                             if xmlns: xmlns_items.append(xmlns)
 710                     except TypeError:
 711                         _raise_serialization_error(k)
 712                     try:
 713                         if isinstance(v, QName):
 714                             v, xmlns = fixtag(v, namespaces)
 715                             if xmlns: xmlns_items.append(xmlns)
 716                     except TypeError:
 717                         _raise_serialization_error(v)
 718                     file.write(" %s=\"%s\"" % (k,v))
 719                 for k, v in xmlns_items:
 720                     file.write(" %s=\"%s\"" % (k,v))
 721             if node.text or len(node):
 722                 file.write(">")
 723                 if node.text:
 724                     file.write(_escape_cdata(node.text, encoding))
 725                 if len(node) > 0: file.write("\n")
 726                 for n in node:
 727                     self._write(file, n, encoding, namespaces, margin + 2)
 728                 if len(node) > 0: file.write(' ' * margin)
 729                 file.write(_encode("</", encoding) + _encode(tag, encoding) + _encode(">\n", encoding))
 730             else:
 731                 file.write("/>\n")
 732             for k, v in xmlns_items:
 733                 del namespaces[v]
 734         if node.tail:
 735             file.write(_escape_cdata(node.tail, encoding))
 736
 737 # --------------------------------------------------------------------
 738 # helpers
 739
 740 ##
 741 # Checks if an object appears to be a valid element object.
 742 #
# @param element An element instance.
 744 # @return A true value if this is an element object.
 745 # @defreturn flag
 746
 747 def iselement(element):
 748     # FIXME: not sure about this; might be a better idea to look
 749     # for tag/attrib/text attributes
 750     return isinstance(element, _ElementInterface) or hasattr(element, "tag")
 751
 752 ##
 753 # Writes an element tree or element structure to sys.stdout.  This
 754 # function should be used for debugging only.
 755 # <p>
 756 # The exact output format is implementation dependent.  In this
 757 # version, it's written as an ordinary XML file.
 758 #
 759 # @param elem An element tree or an individual element.
 760
 761 def dump(elem):
 762     # debugging
 763     if not isinstance(elem, ElementTree):
 764         elem = ElementTree(elem)
 765     elem.write(sys.stdout)
 766     tail = elem.getroot().tail
 767     if not tail or tail[-1] != "\n":
 768         sys.stdout.write("\n")
 769
 770 def _encode(s, encoding):
 771     try:
 772         return s.encode(encoding)
 773     except AttributeError:
 774         return s # 1.5.2: assume the string uses the right encoding
 775
# Pattern matching runs of characters that _encode_entity replaces:
# the reserved characters & < > " and everything outside 7-bit ASCII.
# The unicode variant is built with eval() on a constant literal, so
# that the u"..." syntax is never seen by a 1.5.2 interpreter.
if sys.version[:3] == "1.5":
    _escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2
else:
    _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))

# Named entities for the reserved characters; _encode_entity writes any
# other matched character as a decimal character reference.
_escape_map = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
}

# Maps namespace URIs to the prefixes that fixtag prefers over the
# generated "ns<number>" ones.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}
 795
 796 def _raise_serialization_error(text):
 797     raise TypeError(
 798         "cannot serialize %r (type %s)" % (text, type(text).__name__)
 799         )
 800
 801 def _encode_entity(text, pattern=_escape):
 802     # map reserved and non-ascii characters to numerical entities
 803     def escape_entities(m, map=_escape_map):
 804         out = []
 805         append = out.append
 806         for char in m.group():
 807             text = map.get(char)
 808             if text is None:
 809                 text = "&#%d;" % ord(char)
 810             append(text)
 811         return string.join(out, "")
 812     try:
 813         return _encode(pattern.sub(escape_entities, text), "ascii")
 814     except TypeError:
 815         _raise_serialization_error(text)
 816
 817 #
 818 # the following functions assume an ascii-compatible encoding
 819 # (or "utf-16")
 820
 821 def _escape_cdata(text, encoding=None, replace=str.replace):
 822     # escape character data
 823     try:
 824         if platform.python_version()[0] == '2': # python 2.x.y
 825             if encoding:
 826                 try:
 827                     text = _encode(text, encoding)
 828                 except UnicodeError:
 829                     return _encode_entity(text)
 830
 831         text = replace(text, "&", "&amp;")
 832         text = replace(text, "<", "&lt;")
 833         text = replace(text, ">", "&gt;")
 834         text = replace(text, "####newLine####", "<br \>")
 835         if encoding:
 836             try:
 837                 text = _encode(text, encoding)
 838             except UnicodeError:
 839                 return _encode_entity(text)
 840         return text
 841     except (TypeError, AttributeError):
 842         _raise_serialization_error(text)
 843
 844 def _escape_attrib(text, encoding=None, replace=str.replace):
 845     # escape attribute value
 846     try:
 847         text = replace(text, "&", "&amp;")
 848         text = replace(text, "'", "&apos;") # FIXME: overkill
 849         text = replace(text, "\"", "&quot;")
 850         text = replace(text, "<", "&lt;")
 851         text = replace(text, ">", "&gt;")
 852         if encoding:
 853             try:
 854                 text = _encode(text, encoding)
 855             except UnicodeError:
 856                 return _encode_entity(text)
 857         return text
 858     except (TypeError, AttributeError):
 859         _raise_serialization_error(text)
 860
 861 def fixtag(tag, namespaces):
 862     # given a decorated tag (of the form {uri}tag), return prefixed
 863     # tag and namespace declaration, if any
 864     if isinstance(tag, QName):
 865         tag = tag.text
 866     namespace_uri, tag = string.split(tag[1:], "}", 1)
 867     prefix = namespaces.get(namespace_uri)
 868     if prefix is None:
 869         prefix = _namespace_map.get(namespace_uri)
 870         if prefix is None:
 871             prefix = "ns%d" % len(namespaces)
 872         namespaces[namespace_uri] = prefix
 873         if prefix == "xml":
 874             xmlns = None
 875         else:
 876             xmlns = ("xmlns:%s" % prefix, namespace_uri)
 877     else:
 878         xmlns = None
 879     return "%s:%s" % (prefix, tag), xmlns
 880
 881 ##
 882 # Parses an XML document into an element tree.
 883 #
 884 # @param source A filename or file object containing XML data.
 885 # @param parser An optional parser instance.  If not given, the
 886 #     standard {@link XMLTreeBuilder} parser is used.
 887 # @return An ElementTree instance
 888
 889 def parse(source, parser=None):
 890     tree = ElementTree()
 891     tree.parse(source, parser)
 892     return tree
 893
 894 ##
 895 # Parses an XML document into an element tree incrementally, and reports
 896 # what's going on to the user.
 897 #
 898 # @param source A filename or file object containing XML data.
 899 # @param events A list of events to report back.  If omitted, only "end"
 900 #     events are reported.
 901 # @return A (event, elem) iterator.
 902
 903 class iterparse:
 904
 905     def __init__(self, source, events=None):
 906         if not hasattr(source, "read"):
 907             # OP TEST
 908             print("iterparse.__init__ source = %s" % source)
 909             source = open(source, "rb")
 910         self._file = source
 911         self._events = []
 912         self._index = 0
 913         self.root = self._root = None
 914         self._parser = XMLTreeBuilder()
 915         # wire up the parser for event reporting
 916         parser = self._parser._parser
 917         append = self._events.append
 918         if events is None:
 919             events = ["end"]
 920         for event in events:
 921             if event == "start":
 922                 try:
 923                     parser.ordered_attributes = 1
 924                     parser.specified_attributes = 1
 925                     def handler(tag, attrib_in, event=event, append=append,
 926                                 start=self._parser._start_list):
 927                         append((event, start(tag, attrib_in)))
 928                     parser.StartElementHandler = handler
 929                 except AttributeError:
 930                     def handler(tag, attrib_in, event=event, append=append,
 931                                 start=self._parser._start):
 932                         append((event, start(tag, attrib_in)))
 933                     parser.StartElementHandler = handler
 934             elif event == "end":
 935                 def handler(tag, event=event, append=append,
 936                             end=self._parser._end):
 937                     append((event, end(tag)))
 938                 parser.EndElementHandler = handler
 939             elif event == "start-ns":
 940                 def handler(prefix, uri, event=event, append=append):
 941                     try:
 942                         uri = _encode(uri, "ascii")
 943                     except UnicodeError:
 944                         pass
 945                     append((event, (prefix or "", uri)))
 946                 parser.StartNamespaceDeclHandler = handler
 947             elif event == "end-ns":
 948                 def handler(prefix, event=event, append=append):
 949                     append((event, None))
 950                 parser.EndNamespaceDeclHandler = handler
 951
 952     def next(self):
 953         while 1:
 954             try:
 955                 item = self._events[self._index]
 956             except IndexError:
 957                 if self._parser is None:
 958                     self.root = self._root
 959                     try:
 960                         raise StopIteration
 961                     except NameError:
 962                         raise IndexError
 963                 # load event buffer
 964                 del self._events[:]
 965                 self._index = 0
 966                 data = self._file.read(16384)
 967                 if data:
 968                     self._parser.feed(data)
 969                 else:
 970                     self._root = self._parser.close()
 971                     self._parser = None
 972             else:
 973                 self._index = self._index + 1
 974                 return item
 975
 976     try:
 977         iter
 978         def __iter__(self):
 979             return self
 980     except NameError:
 981         def __getitem__(self, index):
 982             return self.next()
 983
 984 ##
 985 # Parses an XML document from a string constant.  This function can
 986 # be used to embed "XML literals" in Python code.
 987 #
# @param text A string containing XML data.
 989 # @return An Element instance.
 990 # @defreturn Element
 991
 992 def XML(text):
 993     parser = XMLTreeBuilder()
 994     parser.feed(text)
 995     return parser.close()
 996
 997 ##
 998 # Parses an XML document from a string constant, and also returns
 999 # a dictionary which maps from element id:s to elements.
1000 #
# @param text A string containing XML data.
1002 # @return A tuple containing an Element instance and a dictionary.
1003 # @defreturn (Element, dictionary)
1004
def XMLID(text):
    # Parse the text with a fresh XMLTreeBuilder, then walk the result
    # and index every element that has a non-empty "id" attribute.  If an
    # id value occurs more than once, the element visited last wins.
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    by_id = {}
    for node in root.getiterator():
        key = node.get("id")
        if not key:
            continue
        by_id[key] = node
    return root, by_id
1015
1016 ##
1017 # Parses an XML document from a string constant.  Same as {@link #XML}.
1018 #
1019 # @def fromstring(text)
# @param text A string containing XML data.
1021 # @return An Element instance.
1022 # @defreturn Element
1023
# fromstring is bound to the very same function object as XML
fromstring = XML
1025
1026 ##
1027 # Generates a string representation of an XML element, including all
1028 # subelements.
1029 #
1030 # @param element An Element instance.
1031 # @return An encoded string containing the XML data.
1032 # @defreturn string
1033
def tostring(element, encoding=None):
    # Serialize element by writing an ElementTree wrapped around it to an
    # in-memory collector, and join the collected pieces into one string.
    #
    # encoding is passed on to ElementTree.write and is also used to
    # decode any byte strings among the collected pieces.
    class dummy:
        pass
    data = []
    file = dummy()
    file.write = data.append
    ElementTree(element).write(file, encoding)
    # Byte strings are decoded with the encoding that was requested from
    # write(); a plain decode() would use the default codec and fail or
    # garble the text for other encodings.  Without an explicit encoding,
    # utf-8 is used, which is what decode() defaults to on Python 3.
    codec = encoding or "utf-8"
    pieces = []
    for item in data:
        if isinstance(item, bytes):
            item = item.decode(codec)
        pieces.append(item)
    return "".join(pieces)
1047
1048 ##
1049 # Generic element structure builder.  This builder converts a sequence
1050 # of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1051 # #TreeBuilder.end} method calls to a well-formed element structure.
1052 # <p>
1053 # You can use this class to build an element structure using a custom XML
1054 # parser, or a parser for some other XML-like format.
1055 #
1056 # @param element_factory Optional element factory.  This factory
1057 #    is called to create new Element instances, as necessary.
1058
class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        # called as factory(tag, attrs) for every start() call
        self._factory = element_factory

    ##
    # Checks that all elements have been closed, and returns the
    # toplevel document element.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        # identity test, so the element's own comparison is never involved
        assert self._last is not None, "missing toplevel element"
        return self._last

    ##
    # Stores the collected data chunks as the text of the last element
    # (if no end tag has been seen since it was opened) or as its tail
    # (after an end tag), and empties the data collector.

    def _flush(self):
        if self._data:
            if self._last is not None:
                text = ""
                for item in self._data:
                    try:
                        text += item
                    except TypeError:
                        # a chunk that cannot be appended to a text string
                        # (e.g. a bytes object on Python 3) is decoded
                        # first
                        text += item.decode()
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # attach the new element to the innermost open element
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last
1140
1141 ##
1142 # Element structure builder for XML source data, based on the
1143 # <b>expat</b> parser.
1144 #
1145 # @keyparam target Target object.  If omitted, the builder uses an
1146 #     instance of the standard {@link #TreeBuilder} class.
1147 # @keyparam html Predefine HTML entities.  This flag is not supported
1148 #     by the current implementation.
1149 # @see #ElementTree
1150 # @see #TreeBuilder
1151
1152 class XMLTreeBuilder:
1153
1154     def __init__(self, html=0, target=None):
1155         try:
1156             from xml.parsers import expat
1157         except ImportError:
1158             raise ImportError(
1159                 "No module named expat; use SimpleXMLTreeBuilder instead"
1160                 )
1161         self._parser = parser = expat.ParserCreate(None, "}")
1162         if target is None:
1163             target = TreeBuilder()
1164         self._target = target
1165         self._names = {} # name memo cache
1166         # callbacks
1167         parser.DefaultHandlerExpand = self._default
1168         parser.StartElementHandler = self._start
1169         parser.EndElementHandler = self._end
1170         parser.CharacterDataHandler = self._data
1171         # let expat do the buffering, if supported
1172         try:
1173             self._parser.buffer_text = 1
1174         except AttributeError:
1175             pass
1176         # use new-style attribute handling, if supported
1177         try:
1178             self._parser.ordered_attributes = 1
1179             self._parser.specified_attributes = 1
1180             parser.StartElementHandler = self._start_list
1181         except AttributeError:
1182             pass
1183         #encoding = None
1184         #if not parser.returns_unicode:
1185         #    encoding = "utf-8"
1186         # target.xml(encoding, None)
1187         self._doctype = None
1188         self.entity = {}
1189
1190     def _fixtext(self, text):
1191         # convert text string to ascii, if possible
1192         try:
1193             return _encode(text, "ascii")
1194         except UnicodeError:
1195             return text
1196
1197     def _fixname(self, key):
1198         # expand qname, and convert name string to ascii, if possible
1199         try:
1200             name = self._names[key]
1201         except KeyError:
1202             name = key
1203             if "}" in name:
1204                 name = "{" + name
1205             self._names[key] = name = self._fixtext(name)
1206         return name
1207
1208     def _start(self, tag, attrib_in):
1209         fixname = self._fixname
1210         tag = fixname(tag)
1211         attrib = {}
1212         for key, value in attrib_in.items():
1213             attrib[fixname(key)] = self._fixtext(value)
1214         return self._target.start(tag, attrib)
1215
1216     def _start_list(self, tag, attrib_in):
1217         fixname = self._fixname
1218         tag = fixname(tag)
1219         attrib = {}
1220         if attrib_in:
1221             for i in range(0, len(attrib_in), 2):
1222                 attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
1223         return self._target.start(tag, attrib)
1224
1225     def _data(self, text):
1226         return self._target.data(self._fixtext(text))
1227
1228     def _end(self, tag):
1229         return self._target.end(self._fixname(tag))
1230
1231     def _default(self, text):
1232         prefix = text[:1]
1233         if prefix == "&":
1234             # deal with undefined entities
1235             try:
1236                 self._target.data(self.entity[text[1:-1]])
1237             except KeyError:
1238                 from xml.parsers import expat
1239                 raise expat.error(
1240                     "undefined entity %s: line %d, column %d" %
1241                     (text, self._parser.ErrorLineNumber,
1242                     self._parser.ErrorColumnNumber)
1243                     )
1244         elif prefix == "<" and text[:9] == "<!DOCTYPE":
1245             self._doctype = [] # inside a doctype declaration
1246         elif self._doctype is not None:
1247             # parse doctype contents
1248             if prefix == ">":
1249                 self._doctype = None
1250                 return
1251             text = string.strip(text)
1252             if not text:
1253                 return
1254             self._doctype.append(text)
1255             n = len(self._doctype)
1256             if n > 2:
1257                 type = self._doctype[1]
1258                 if type == "PUBLIC" and n == 4:
1259                     name, type, pubid, system = self._doctype
1260                 elif type == "SYSTEM" and n == 3:
1261                     name, type, system = self._doctype
1262                     pubid = None
1263                 else:
1264                     return
1265                 if pubid:
1266                     pubid = pubid[1:-1]
1267                 self.doctype(name, pubid, system[1:-1])
1268                 self._doctype = None
1269
1270     ##
1271     # Handles a doctype declaration.
1272     #
1273     # @param name Doctype name.
1274     # @param pubid Public identifier.
1275     # @param system System identifier.
1276
1277     def doctype(self, name, pubid, system):
1278         pass
1279
1280     ##
1281     # Feeds data to the parser.
1282     #
1283     # @param data Encoded data.
1284
1285     def feed(self, data):
1286         """
1287         my_str = "hello world"
1288         my_str_as_bytes = str.encode(my_str)
1289         type(my_str_as_bytes) # ensure it is byte representation
1290         my_decoded_str = my_str_as_bytes.decode()
1291         type(my_decoded_str) # ensure it is string representation
1292         """
1293         try:
1294             self._parser.Parse(data, 0)
1295         except:
1296             print("*** problem feed:\n%s" % data.decode('utf-8'))
1297
1298     ##
1299     # Finishes feeding data to the parser.
1300     #
1301     # @return An element structure.
1302     # @defreturn Element
1303
1304     def close(self):
1305         self._parser.Parse("", 1) # end of data
1306         tree = self._target.close()
1307         del self._target, self._parser # get rid of circular references
1308         return tree