src/ElementTreePython2.py

   1 #
   2 # ElementTree
   3 # $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
   4 #
   5 # light-weight XML support for Python 1.5.2 and later.
   6 #
   7 # history:
   8 # 2001-10-20 fl   created (from various sources)
   9 # 2001-11-01 fl   return root from parse method
  10 # 2002-02-16 fl   sort attributes in lexical order
  11 # 2002-04-06 fl   TreeBuilder refactoring, added PythonDoc markup
  12 # 2002-05-01 fl   finished TreeBuilder refactoring
  13 # 2002-07-14 fl   added basic namespace support to ElementTree.write
  14 # 2002-07-25 fl   added QName attribute support
  15 # 2002-10-20 fl   fixed encoding in write
  16 # 2002-11-24 fl   changed default encoding to ascii; fixed attribute encoding
  17 # 2002-11-27 fl   accept file objects or file names for parse/write
  18 # 2002-12-04 fl   moved XMLTreeBuilder back to this module
  19 # 2003-01-11 fl   fixed entity encoding glitch for us-ascii
  20 # 2003-02-13 fl   added XML literal factory
  21 # 2003-02-21 fl   added ProcessingInstruction/PI factory
  22 # 2003-05-11 fl   added tostring/fromstring helpers
  23 # 2003-05-26 fl   added ElementPath support
  24 # 2003-07-05 fl   added makeelement factory method
  25 # 2003-07-28 fl   added more well-known namespace prefixes
  26 # 2003-08-15 fl   fixed typo in ElementTree.findtext (Thomas Dartsch)
  27 # 2003-09-04 fl   fall back on emulator if ElementPath is not installed
  28 # 2003-10-31 fl   markup updates
  29 # 2003-11-15 fl   fixed nested namespace bug
  30 # 2004-03-28 fl   added XMLID helper
  31 # 2004-06-02 fl   added default support to findtext
  32 # 2004-06-08 fl   fixed encoding of non-ascii element/attribute names
  33 # 2004-08-23 fl   take advantage of post-2.1 expat features
  34 # 2005-02-01 fl   added iterparse implementation
  35 # 2005-03-02 fl   fixed iterparse support for pre-2.2 versions
  36 #
  37 # Copyright (c) 1999-2005 by Fredrik Lundh.  All rights reserved.
  38 #
  39 # fredrik@pythonware.com
  40 # http://www.pythonware.com
  41 #
  42 # --------------------------------------------------------------------
  43 # The ElementTree toolkit is
  44 #
  45 # Copyright (c) 1999-2005 by Fredrik Lundh
  46 #
  47 # By obtaining, using, and/or copying this software and/or its
  48 # associated documentation, you agree that you have read, understood,
  49 # and will comply with the following terms and conditions:
  50 #
  51 # Permission to use, copy, modify, and distribute this software and
  52 # its associated documentation for any purpose and without fee is
  53 # hereby granted, provided that the above copyright notice appears in
  54 # all copies, and that both that copyright notice and this permission
  55 # notice appear in supporting documentation, and that the name of
  56 # Secret Labs AB or the author not be used in advertising or publicity
  57 # pertaining to distribution of the software without specific, written
  58 # prior permission.
  59 #
  60 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
  61 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
  62 # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
  63 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
  64 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  65 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  66 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  67 # OF THIS SOFTWARE.
  68 # --------------------------------------------------------------------
  69
  70 __all__ = [
  71     # public symbols
  72     "Comment",
  73     "dump",
  74     "Element",
  75     "ElementTree",
  76     "fromstring",
  77     "iselement",
  78     "iterparse",
  79     "parse",
  80     "PI",
  81     "ProcessingInstruction",
  82     "QName",
  83     "SubElement",
  84     "tostring",
  85     "TreeBuilder",
  86     "VERSION",
  87     "XML",
  88     "XMLTreeBuilder",
  89 ]
  90
  91 ##
  92 # The <b>Element</b> type is a flexible container object, designed to
  93 # store hierarchical data structures in memory. The type can be
  94 # described as a cross between a list and a dictionary.
  95 # <p>
  96 # Each element has a number of properties associated with it:
  97 # <ul>
  98 # <li>a <i>tag</i>. This is a string identifying what kind of data
  99 # this element represents (the element type, in other words).</li>
 100 # <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
 101 # <li>a <i>text</i> string.</li>
 102 # <li>an optional <i>tail</i> string.</li>
 103 # <li>a number of <i>child elements</i>, stored in a Python sequence</li>
 104 # </ul>
 105 #
 106 # To create an element instance, use the {@link #Element} or {@link
 107 # #SubElement} factory functions.
 108 # <p>
 109 # The {@link #ElementTree} class can be used to wrap an element
 110 # structure, and convert it from and to XML.
 111 ##
 112
 113 import string, sys, re, platform
 114
 115
 116 class _SimpleElementPath:
 117     # emulate pre-1.2 find/findtext/findall behaviour
 118     def find(self, element, tag):
 119         for elem in element:
 120             if elem.tag == tag:
 121                 return elem
 122         return None
 123
 124     def findtext(self, element, tag, default=None):
 125         for elem in element:
 126             if elem.tag == tag:
 127                 return elem.text or ""
 128         return default
 129
 130     def findall(self, element, tag):
 131         if tag[:3] == ".//":
 132             return element.getiterator(tag[3:])
 133         result = []
 134         for elem in element:
 135             if elem.tag == tag:
 136                 result.append(elem)
 137         return result
 138
 139
 140 """
 141 # obsolete
 142 # ElementPath.py is for python3 2019
 143 # file inexisting in sat before 2019
 144 try:
 145     import ElementPath
 146 except ImportError:
 147     # FIXME: issue warning in this case?
 148     ElementPath = _SimpleElementPath()
 149 """
 150 ElementPath = _SimpleElementPath()  # before 2019 python2 situation sat5.0
 151
 152 # TODO: add support for custom namespace resolvers/default namespaces
 153 # TODO: add improved support for incremental parsing
 154
 155 VERSION = "1.2.6"
 156
 157 ##
 158 # Internal element class.  This class defines the Element interface,
 159 # and provides a reference implementation of this interface.
 160 # <p>
 161 # You should not create instances of this class directly.  Use the
 162 # appropriate factory functions instead, such as {@link #Element}
 163 # and {@link #SubElement}.
 164 #
 165 # @see Element
 166 # @see SubElement
 167 # @see Comment
 168 # @see ProcessingInstruction
 169
 170
 171 class _ElementInterface:
 172     # <tag attrib>text<child/>...</tag>tail
 173
 174     ##
 175     # (Attribute) Element tag.
 176
 177     tag = None
 178
 179     ##
 180     # (Attribute) Element attribute dictionary.  Where possible, use
 181     # {@link #_ElementInterface.get},
 182     # {@link #_ElementInterface.set},
 183     # {@link #_ElementInterface.keys}, and
 184     # {@link #_ElementInterface.items} to access
 185     # element attributes.
 186
 187     attrib = None
 188
 189     ##
 190     # (Attribute) Text before first subelement.  This is either a
 191     # string or the value None, if there was no text.
 192
 193     text = None
 194
 195     ##
 196     # (Attribute) Text after this element's end tag, but before the
 197     # next sibling element's start tag.  This is either a string or
 198     # the value None, if there was no text.
 199
 200     tail = None  # text after end tag, if any
 201
 202     def __init__(self, tag, attrib):
 203         self.tag = tag
 204         self.attrib = attrib
 205         self._children = []
 206
 207     def __repr__(self):
 208         return "<Element %s at %x>" % (self.tag, id(self))
 209
 210     ##
 211     # Creates a new element object of the same type as this element.
 212     #
 213     # @param tag Element tag.
 214     # @param attrib Element attributes, given as a dictionary.
 215     # @return A new element instance.
 216
 217     def makeelement(self, tag, attrib):
 218         return Element(tag, attrib)
 219
 220     ##
 221     # Returns the number of subelements.
 222     #
 223     # @return The number of subelements.
 224
 225     def __len__(self):
 226         return len(self._children)
 227
 228     ##
 229     # Returns the given subelement.
 230     #
 231     # @param index What subelement to return.
 232     # @return The given subelement.
 233     # @exception IndexError If the given element does not exist.
 234
 235     def __getitem__(self, index):
 236         return self._children[index]
 237
 238     ##
 239     # Replaces the given subelement.
 240     #
 241     # @param index What subelement to replace.
 242     # @param element The new element value.
 243     # @exception IndexError If the given element does not exist.
 244     # @exception AssertionError If element is not a valid object.
 245
 246     def __setitem__(self, index, element):
 247         assert iselement(element)
 248         self._children[index] = element
 249
 250     ##
 251     # Deletes the given subelement.
 252     #
 253     # @param index What subelement to delete.
 254     # @exception IndexError If the given element does not exist.
 255
 256     def __delitem__(self, index):
 257         del self._children[index]
 258
 259     ##
 260     # Returns a list containing subelements in the given range.
 261     #
 262     # @param start The first subelement to return.
 263     # @param stop The first subelement that shouldn't be returned.
 264     # @return A sequence object containing subelements.
 265
 266     def __getslice__(self, start, stop):
 267         return self._children[start:stop]
 268
 269     ##
 270     # Replaces a number of subelements with elements from a sequence.
 271     #
 272     # @param start The first subelement to replace.
 273     # @param stop The first subelement that shouldn't be replaced.
 274     # @param elements A sequence object with zero or more elements.
 275     # @exception AssertionError If a sequence member is not a valid object.
 276
 277     def __setslice__(self, start, stop, elements):
 278         for element in elements:
 279             assert iselement(element)
 280         self._children[start:stop] = list(elements)
 281
 282     ##
 283     # Deletes a number of subelements.
 284     #
 285     # @param start The first subelement to delete.
 286     # @param stop The first subelement to leave in there.
 287
 288     def __delslice__(self, start, stop):
 289         del self._children[start:stop]
 290
 291     ##
 292     # Adds a subelement to the end of this element.
 293     #
 294     # @param element The element to add.
 295     # @exception AssertionError If a sequence member is not a valid object.
 296
 297     def append(self, element):
 298         assert iselement(element)
 299         self._children.append(element)
 300
 301     ##
 302     # Inserts a subelement at the given position in this element.
 303     #
 304     # @param index Where to insert the new subelement.
 305     # @exception AssertionError If the element is not a valid object.
 306
 307     def insert(self, index, element):
 308         assert iselement(element)
 309         self._children.insert(index, element)
 310
 311     ##
 312     # Removes a matching subelement.  Unlike the <b>find</b> methods,
 313     # this method compares elements based on identity, not on tag
 314     # value or contents.
 315     #
 316     # @param element What element to remove.
 317     # @exception ValueError If a matching element could not be found.
 318     # @exception AssertionError If the element is not a valid object.
 319
 320     def remove(self, element):
 321         assert iselement(element)
 322         self._children.remove(element)
 323
 324     ##
 325     # Returns all subelements.  The elements are returned in document
 326     # order.
 327     #
 328     # @return A list of subelements.
 329     # @defreturn list of Element instances
 330
 331     def getchildren(self):
 332         return self._children
 333
 334     ##
 335     # Finds the first matching subelement, by tag name or path.
 336     #
 337     # @param path What element to look for.
 338     # @return The first matching element, or None if no element was found.
 339     # @defreturn Element or None
 340
 341     def find(self, path):
 342         if ElementPath.find(self, path) == None:
 343             return ElementPath.find(self, path.encode())
 344         return ElementPath.find(self, path)
 345
 346     ##
 347     # Finds text for the first matching subelement, by tag name or path.
 348     #
 349     # @param path What element to look for.
 350     # @param default What to return if the element was not found.
 351     # @return The text content of the first matching element, or the
 352     #     default value no element was found.  Note that if the element
 353     #     has is found, but has no text content, this method returns an
 354     #     empty string.
 355     # @defreturn string
 356
 357     def findtext(self, path, default=None):
 358         return ElementPath.findtext(self, path, default)
 359
 360     ##
 361     # Finds all matching subelements, by tag name or path.
 362     #
 363     # @param path What element to look for.
 364     # @return A list or iterator containing all matching elements,
 365     #    in document order.
 366     # @defreturn list of Element instances
 367
 368     def findall(self, path):
 369         return ElementPath.findall(self, path)
 370
 371     ##
 372     # Resets an element.  This function removes all subelements, clears
 373     # all attributes, and sets the text and tail attributes to None.
 374
 375     def clear(self):
 376         self.attrib.clear()
 377         self._children = []
 378         self.text = self.tail = None
 379
 380     ##
 381     # Gets an element attribute.
 382     #
 383     # @param key What attribute to look for.
 384     # @param default What to return if the attribute was not found.
 385     # @return The attribute value, or the default value, if the
 386     #     attribute was not found.
 387     # @defreturn string or None
 388
 389     def get(self, key, default=None):
 390         res = self.attrib.get(key, default)
 391         if not res:
 392             res = self.attrib.get(key.encode(), default)
 393         if isinstance(res, bytes):
 394             return res.decode()
 395         else:
 396             return res
 397
 398     ##
 399     # Sets an element attribute.
 400     #
 401     # @param key What attribute to set.
 402     # @param value The attribute value.
 403
 404     def set(self, key, value):
 405         self.attrib[key] = value
 406
 407     ##
 408     # Gets a list of attribute names.  The names are returned in an
 409     # arbitrary order (just like for an ordinary Python dictionary).
 410     #
 411     # @return A list of element attribute names.
 412     # @defreturn list of strings
 413
 414     def keys(self):
 415         res = []
 416         for key in self.attrib.keys():
 417             if isinstance(key, bytes):
 418                 res.append(key.decode())
 419             else:
 420                 res.append(key)
 421         return res
 422
 423     ##
 424     # Gets element attributes, as a sequence.  The attributes are
 425     # returned in an arbitrary order.
 426     #
 427     # @return A list of (name, value) tuples for all attributes.
 428     # @defreturn list of (string, string) tuples
 429
 430     def items(self):
 431         return self.attrib.items()
 432
 433     ##
 434     # Creates a tree iterator.  The iterator loops over this element
 435     # and all subelements, in document order, and returns all elements
 436     # with a matching tag.
 437     # <p>
 438     # If the tree structure is modified during iteration, the result
 439     # is undefined.
 440     #
 441     # @param tag What tags to look for (default is to return all elements).
 442     # @return A list or iterator containing all the matching elements.
 443     # @defreturn list or iterator
 444
 445     def getiterator(self, tag=None):
 446         nodes = []
 447         if tag == "*":
 448             tag = None
 449         if tag is None or self.tag == tag:
 450             nodes.append(self)
 451         for node in self._children:
 452             nodes.extend(node.getiterator(tag))
 453         return nodes
 454
 455
 456 # compatibility
 457 _Element = _ElementInterface
 458
 459 ##
 460 # Element factory.  This function returns an object implementing the
 461 # standard Element interface.  The exact class or type of that object
 462 # is implementation dependent, but it will always be compatible with
 463 # the {@link #_ElementInterface} class in this module.
 464 # <p>
 465 # The element name, attribute names, and attribute values can be
 466 # either 8-bit ASCII strings or Unicode strings.
 467 #
 468 # @param tag The element name.
 469 # @param attrib An optional dictionary, containing element attributes.
 470 # @param **extra Additional attributes, given as keyword arguments.
 471 # @return An element instance.
 472 # @defreturn Element
 473
 474
 475 def Element(tag, attrib={}, **extra):
 476     attrib = attrib.copy()
 477     attrib.update(extra)
 478     return _ElementInterface(tag, attrib)
 479
 480
 481 ##
 482 # Subelement factory.  This function creates an element instance, and
 483 # appends it to an existing element.
 484 # <p>
 485 # The element name, attribute names, and attribute values can be
 486 # either 8-bit ASCII strings or Unicode strings.
 487 #
 488 # @param parent The parent element.
 489 # @param tag The subelement name.
 490 # @param attrib An optional dictionary, containing element attributes.
 491 # @param **extra Additional attributes, given as keyword arguments.
 492 # @return An element instance.
 493 # @defreturn Element
 494
 495
 496 def SubElement(parent, tag, attrib={}, **extra):
 497     attrib = attrib.copy()
 498     attrib.update(extra)
 499     element = parent.makeelement(tag, attrib)
 500     parent.append(element)
 501     return element
 502
 503
 504 ##
 505 # Comment element factory.  This factory function creates a special
 506 # element that will be serialized as an XML comment.
 507 # <p>
 508 # The comment string can be either an 8-bit ASCII string or a Unicode
 509 # string.
 510 #
 511 # @param text A string containing the comment string.
 512 # @return An element instance, representing a comment.
 513 # @defreturn Element
 514
 515
 516 def Comment(text=None):
 517     element = Element(Comment)
 518     element.text = text
 519     return element
 520
 521
 522 ##
 523 # PI element factory.  This factory function creates a special element
 524 # that will be serialized as an XML processing instruction.
 525 #
 526 # @param target A string containing the PI target.
 527 # @param text A string containing the PI contents, if any.
 528 # @return An element instance, representing a PI.
 529 # @defreturn Element
 530
 531
 532 def ProcessingInstruction(target, text=None):
 533     element = Element(ProcessingInstruction)
 534     element.text = target
 535     if text:
 536         element.text = element.text + " " + text
 537     return element
 538
 539
 540 PI = ProcessingInstruction
 541
 542 ##
 543 # QName wrapper.  This can be used to wrap a QName attribute value, in
 544 # order to get proper namespace handling on output.
 545 #
 546 # @param text A string containing the QName value, in the form {uri}local,
 547 #     or, if the tag argument is given, the URI part of a QName.
# @param tag Optional tag.  If given, the first argument is interpreted as
#     a URI, and this argument is interpreted as a local name.
 550 # @return An opaque object, representing the QName.
 551
 552
 553 class QName:
 554     def __init__(self, text_or_uri, tag=None):
 555         if tag:
 556             text_or_uri = "{%s}%s" % (text_or_uri, tag)
 557         self.text = text_or_uri
 558
 559     def __str__(self):
 560         return self.text
 561
 562     def __hash__(self):
 563         return hash(self.text)
 564
 565     def __cmp__(self, other):
 566         if isinstance(other, QName):
 567             return cmp(self.text, other.text)
 568         return cmp(self.text, other)
 569
 570
 571 ##
 572 # ElementTree wrapper class.  This class represents an entire element
 573 # hierarchy, and adds some extra support for serialization to and from
 574 # standard XML.
 575 #
 576 # @param element Optional root element.
 577 # @keyparam file Optional file handle or name.  If given, the
 578 #     tree is initialized with the contents of this XML file.
 579
 580
 581 class ElementTree:
 582     def __init__(self, element=None, file=None):
 583         assert element is None or iselement(element)
 584         self._root = element  # first node
 585         if file:
 586             self.parse(file)
 587
 588     ##
 589     # Gets the root element for this tree.
 590     #
 591     # @return An element instance.
 592     # @defreturn Element
 593
 594     def getroot(self):
 595         return self._root
 596
 597     ##
 598     # Replaces the root element for this tree.  This discards the
 599     # current contents of the tree, and replaces it with the given
 600     # element.  Use with care.
 601     #
 602     # @param element An element instance.
 603
 604     def _setroot(self, element):
 605         assert iselement(element)
 606         self._root = element
 607
 608     ##
 609     # Loads an external XML document into this element tree.
 610     #
 611     # @param source A file name or file object.
 612     # @param parser An optional parser instance.  If not given, the
 613     #     standard {@link XMLTreeBuilder} parser is used.
 614     # @return The document root element.
 615     # @defreturn Element
 616
 617     def parse(self, source, parser=None):
 618         if not hasattr(source, "read"):
 619             source = open(source, "rb")
 620         if not parser:
 621             parser = XMLTreeBuilder()
 622         while 1:
 623             data = source.read(32768)
 624             if not data:
 625                 break
 626             parser.feed(data)
 627         self._root = parser.close()
 628         return self._root
 629
 630     ##
 631     # Creates a tree iterator for the root element.  The iterator loops
 632     # over all elements in this tree, in document order.
 633     #
 634     # @param tag What tags to look for (default is to return all elements)
 635     # @return An iterator.
 636     # @defreturn iterator
 637
 638     def getiterator(self, tag=None):
 639         assert self._root is not None
 640         return self._root.getiterator(tag)
 641
 642     ##
 643     # Finds the first toplevel element with given tag.
 644     # Same as getroot().find(path).
 645     #
 646     # @param path What element to look for.
 647     # @return The first matching element, or None if no element was found.
 648     # @defreturn Element or None
 649
 650     def find(self, path):
 651         assert self._root is not None
 652         if path[:1] == "/":
 653             path = "." + path
 654         return self._root.find(path)
 655
 656     ##
 657     # Finds the element text for the first toplevel element with given
 658     # tag.  Same as getroot().findtext(path).
 659     #
 660     # @param path What toplevel element to look for.
 661     # @param default What to return if the element was not found.
 662     # @return The text content of the first matching element, or the
 663     #     default value no element was found.  Note that if the element
 664     #     has is found, but has no text content, this method returns an
 665     #     empty string.
 666     # @defreturn string
 667
 668     def findtext(self, path, default=None):
 669         assert self._root is not None
 670         if path[:1] == "/":
 671             path = "." + path
 672         return self._root.findtext(path, default)
 673
 674     ##
 675     # Finds all toplevel elements with the given tag.
 676     # Same as getroot().findall(path).
 677     #
 678     # @param path What element to look for.
 679     # @return A list or iterator containing all matching elements,
 680     #    in document order.
 681     # @defreturn list of Element instances
 682
 683     def findall(self, path):
 684         assert self._root is not None
 685         if path[:1] == "/":
 686             path = "." + path
 687         return self._root.findall(path)
 688
 689     ##
 690     # Writes the element tree to a file, as XML.
 691     #
 692     # @param file A file name, or a file object opened for writing.
 693     # @param encoding Optional output encoding (default is US-ASCII).
 694
 695     def write(self, file, encoding="us-ascii"):
 696         assert self._root is not None
 697         if not hasattr(file, "write"):
 698             file = open(file, "wb")
 699         if not encoding:
 700             encoding = "us-ascii"
 701         elif encoding != "utf-8" and encoding != "us-ascii":
 702             file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
 703         self._write(file, self._root, encoding, {})
 704
 705     def _write(self, file, node, encoding, namespaces, margin=0):
 706         # write XML to file
 707         tag = node.tag
 708         if tag is Comment:
 709             file.write("<!-- %s -->\n" % _escape_cdata(node.text, encoding))
 710         elif tag is ProcessingInstruction:
 711             file.write("<?%s?>\n" % _escape_cdata(node.text, encoding))
 712         else:
 713             items = node.items()
 714             xmlns_items = []  # new namespaces in this scope
 715             try:
 716                 if isinstance(tag, QName) or tag[:1] == "{":
 717                     tag, xmlns = fixtag(tag, namespaces)
 718                     if xmlns:
 719                         xmlns_items.append(xmlns)
 720             except TypeError:
 721                 _raise_serialization_error(tag)
 722             file.write(" " * margin)
 723             file.write(_encode("<", encoding) + _encode(tag, encoding))
 724             if items or xmlns_items:
 725                 try:
 726                     items = sorted(items)  # lexical order
 727                 except:
 728                     print("*** problem sorting items", items)
 729                 for k, v in items:
 730                     try:
 731                         if isinstance(k, QName) or k[:1] == "{":
 732                             k, xmlns = fixtag(k, namespaces)
 733                             if xmlns:
 734                                 xmlns_items.append(xmlns)
 735                     except TypeError:
 736                         _raise_serialization_error(k)
 737                     try:
 738                         if isinstance(v, QName):
 739                             v, xmlns = fixtag(v, namespaces)
 740                             if xmlns:
 741                                 xmlns_items.append(xmlns)
 742                     except TypeError:
 743                         _raise_serialization_error(v)
 744                     file.write(' %s="%s"' % (k, v))
 745                 for k, v in xmlns_items:
 746                     file.write(' %s="%s"' % (k, v))
 747             if node.text or len(node):
 748                 file.write(">")
 749                 if node.text:
 750                     file.write(_escape_cdata(node.text, encoding))
 751                 if len(node) > 0:
 752                     file.write("\n")
 753                 for n in node:
 754                     self._write(file, n, encoding, namespaces, margin + 2)
 755                 if len(node) > 0:
 756                     file.write(" " * margin)
 757                 file.write(
 758                     _encode("</", encoding)
 759                     + _encode(tag, encoding)
 760                     + _encode(">\n", encoding)
 761                 )
 762             else:
 763                 file.write("/>\n")
 764             for k, v in xmlns_items:
 765                 del namespaces[v]
 766         if node.tail:
 767             file.write(_escape_cdata(node.tail, encoding))
 768
 769
 770 # --------------------------------------------------------------------
 771 # helpers
 772
 773 ##
 774 # Checks if an object appears to be a valid element object.
 775 #
# @param element An element instance.
 777 # @return A true value if this is an element object.
 778 # @defreturn flag
 779
 780
 781 def iselement(element):
 782     # FIXME: not sure about this; might be a better idea to look
 783     # for tag/attrib/text attributes
 784     return isinstance(element, _ElementInterface) or hasattr(element, "tag")
 785
 786
 787 ##
 788 # Writes an element tree or element structure to sys.stdout.  This
 789 # function should be used for debugging only.
 790 # <p>
 791 # The exact output format is implementation dependent.  In this
 792 # version, it's written as an ordinary XML file.
 793 #
 794 # @param elem An element tree or an individual element.
 795
 796
 797 def dump(elem):
 798     # debugging
 799     if not isinstance(elem, ElementTree):
 800         elem = ElementTree(elem)
 801     elem.write(sys.stdout)
 802     tail = elem.getroot().tail
 803     if not tail or tail[-1] != "\n":
 804         sys.stdout.write("\n")
 805
 806
 807 def _encode(s, encoding):
 808     try:
 809         return s.encode(encoding)
 810     except AttributeError:
 811         return s  # 1.5.2: assume the string uses the right encoding
 812
 813
 814 if sys.version[:3] == "1.5":
 815     _escape = re.compile(r"[&<>\"\x80-\xff]+")  # 1.5.2
 816 else:
 817     _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))
 818
 819 _escape_map = {
 820     "&": "&amp;",
 821     "<": "&lt;",
 822     ">": "&gt;",
 823     '"': "&quot;",
 824 }
 825
 826 _namespace_map = {
 827     # "well-known" namespace prefixes
 828     "http://www.w3.org/XML/1998/namespace": "xml",
 829     "http://www.w3.org/1999/xhtml": "html",
 830     "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
 831     "http://schemas.xmlsoap.org/wsdl/": "wsdl",
 832 }
 833
 834
 835 def _raise_serialization_error(text):
 836     raise TypeError("cannot serialize %r (type %s)" % (text, type(text).__name__))
 837
 838
 839 def _encode_entity(text, pattern=_escape):
 840     # map reserved and non-ascii characters to numerical entities
 841     def escape_entities(m, map=_escape_map):
 842         out = []
 843         append = out.append
 844         for char in m.group():
 845             text = map.get(char)
 846             if text is None:
 847                 text = "&#%d;" % ord(char)
 848             append(text)
 849         return string.join(out, "")
 850
 851     try:
 852         return _encode(pattern.sub(escape_entities, text), "ascii")
 853     except TypeError:
 854         _raise_serialization_error(text)
 855
 856
 857 #
 858 # the following functions assume an ascii-compatible encoding
 859 # (or "utf-16")
 860
 861
 862 def _escape_cdata(text, encoding=None, replace=str.replace):
 863     # escape character data
 864     try:
 865         if platform.python_version()[0] == "2":  # python 2.x.y
 866             if encoding:
 867                 try:
 868                     text = _encode(text, encoding)
 869                 except UnicodeError:
 870                     return _encode_entity(text)
 871
 872         text = replace(text, "&", "&amp;")
 873         text = replace(text, "<", "&lt;")
 874         text = replace(text, ">", "&gt;")
 875         text = replace(text, "####newLine####", "<br \>")
 876         if encoding:
 877             try:
 878                 text = _encode(text, encoding)
 879             except UnicodeError:
 880                 return _encode_entity(text)
 881         return text
 882     except (TypeError, AttributeError):
 883         _raise_serialization_error(text)
 884
 885
 886 def _escape_attrib(text, encoding=None, replace=str.replace):
 887     # escape attribute value
 888     try:
 889         text = replace(text, "&", "&amp;")
 890         text = replace(text, "'", "&apos;")  # FIXME: overkill
 891         text = replace(text, '"', "&quot;")
 892         text = replace(text, "<", "&lt;")
 893         text = replace(text, ">", "&gt;")
 894         if encoding:
 895             try:
 896                 text = _encode(text, encoding)
 897             except UnicodeError:
 898                 return _encode_entity(text)
 899         return text
 900     except (TypeError, AttributeError):
 901         _raise_serialization_error(text)
 902
 903
 904 def fixtag(tag, namespaces):
 905     # given a decorated tag (of the form {uri}tag), return prefixed
 906     # tag and namespace declaration, if any
 907     if isinstance(tag, QName):
 908         tag = tag.text
 909     namespace_uri, tag = string.split(tag[1:], "}", 1)
 910     prefix = namespaces.get(namespace_uri)
 911     if prefix is None:
 912         prefix = _namespace_map.get(namespace_uri)
 913         if prefix is None:
 914             prefix = "ns%d" % len(namespaces)
 915         namespaces[namespace_uri] = prefix
 916         if prefix == "xml":
 917             xmlns = None
 918         else:
 919             xmlns = ("xmlns:%s" % prefix, namespace_uri)
 920     else:
 921         xmlns = None
 922     return "%s:%s" % (prefix, tag), xmlns
 923
 924
 925 ##
 926 # Parses an XML document into an element tree.
 927 #
 928 # @param source A filename or file object containing XML data.
 929 # @param parser An optional parser instance.  If not given, the
 930 #     standard {@link XMLTreeBuilder} parser is used.
 931 # @return An ElementTree instance
 932
 933
 934 def parse(source, parser=None):
 935     tree = ElementTree()
 936     tree.parse(source, parser)
 937     return tree
 938
 939
 940 ##
 941 # Parses an XML document into an element tree incrementally, and reports
 942 # what's going on to the user.
 943 #
 944 # @param source A filename or file object containing XML data.
 945 # @param events A list of events to report back.  If omitted, only "end"
 946 #     events are reported.
 947 # @return A (event, elem) iterator.
 948
 949
 950 class iterparse:
 951     def __init__(self, source, events=None):
 952         if not hasattr(source, "read"):
 953             # OP TEST
 954             print("iterparse.__init__ source = %s" % source)
 955             source = open(source, "rb")
 956         self._file = source
 957         self._events = []
 958         self._index = 0
 959         self.root = self._root = None
 960         self._parser = XMLTreeBuilder()
 961         # wire up the parser for event reporting
 962         parser = self._parser._parser
 963         append = self._events.append
 964         if events is None:
 965             events = ["end"]
 966         for event in events:
 967             if event == "start":
 968                 try:
 969                     parser.ordered_attributes = 1
 970                     parser.specified_attributes = 1
 971
 972                     def handler(
 973                         tag,
 974                         attrib_in,
 975                         event=event,
 976                         append=append,
 977                         start=self._parser._start_list,
 978                     ):
 979                         append((event, start(tag, attrib_in)))
 980
 981                     parser.StartElementHandler = handler
 982                 except AttributeError:
 983
 984                     def handler(
 985                         tag,
 986                         attrib_in,
 987                         event=event,
 988                         append=append,
 989                         start=self._parser._start,
 990                     ):
 991                         append((event, start(tag, attrib_in)))
 992
 993                     parser.StartElementHandler = handler
 994             elif event == "end":
 995
 996                 def handler(tag, event=event, append=append, end=self._parser._end):
 997                     append((event, end(tag)))
 998
 999                 parser.EndElementHandler = handler
1000             elif event == "start-ns":
1001
1002                 def handler(prefix, uri, event=event, append=append):
1003                     try:
1004                         uri = _encode(uri, "ascii")
1005                     except UnicodeError:
1006                         pass
1007                     append((event, (prefix or "", uri)))
1008
1009                 parser.StartNamespaceDeclHandler = handler
1010             elif event == "end-ns":
1011
1012                 def handler(prefix, event=event, append=append):
1013                     append((event, None))
1014
1015                 parser.EndNamespaceDeclHandler = handler
1016
1017     def next(self):
1018         while 1:
1019             try:
1020                 item = self._events[self._index]
1021             except IndexError:
1022                 if self._parser is None:
1023                     self.root = self._root
1024                     try:
1025                         raise StopIteration
1026                     except NameError:
1027                         raise IndexError
1028                 # load event buffer
1029                 del self._events[:]
1030                 self._index = 0
1031                 data = self._file.read(16384)
1032                 if data:
1033                     self._parser.feed(data)
1034                 else:
1035                     self._root = self._parser.close()
1036                     self._parser = None
1037             else:
1038                 self._index = self._index + 1
1039                 return item
1040
1041     try:
1042         iter
1043
1044         def __iter__(self):
1045             return self
1046
1047     except NameError:
1048
1049         def __getitem__(self, index):
1050             return self.next()
1051
1052
1053 ##
1054 # Parses an XML document from a string constant.  This function can
1055 # be used to embed "XML literals" in Python code.
1056 #
# @param text A string containing XML data.
1058 # @return An Element instance.
1059 # @defreturn Element
1060
1061
def XML(text):
    # feed the complete text to a fresh builder in one go and return
    # whatever its close method produces
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    return root
1066
1067
1068 ##
1069 # Parses an XML document from a string constant, and also returns
# a dictionary which maps from element ids to elements.
1071 #
# @param text A string containing XML data.
1073 # @return A tuple containing an Element instance and a dictionary.
1074 # @defreturn (Element, dictionary)
1075
1076
def XMLID(text):
    # parse the text, then index every element that carries a non-empty
    # "id" attribute; a later element with the same id replaces an
    # earlier one
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    by_id = {}
    for node in root.getiterator():
        ident = node.get("id")
        if not ident:
            continue
        by_id[ident] = node
    return root, by_id
1087
1088
1089 ##
1090 # Parses an XML document from a string constant.  Same as {@link #XML}.
1091 #
1092 # @def fromstring(text)
# @param text A string containing XML data.
1094 # @return An Element instance.
1095 # @defreturn Element
1096
fromstring = XML  # alias: both names are bound to the same function
1098
1099 ##
1100 # Generates a string representation of an XML element, including all
1101 # subelements.
1102 #
1103 # @param element An Element instance.
1104 # @return An encoded string containing the XML data.
1105 # @defreturn string
1106
1107
def tostring(element, encoding=None):
    # element: the element to serialize.
    # encoding: optional encoding name, handed on to ElementTree.write.
    # returns: the serialized document as a single string.
    class _Collector:
        pass

    # minimal file-like object: write() just appends to a list
    chunks = []
    sink = _Collector()
    sink.write = chunks.append
    ElementTree(element).write(sink, encoding)
    pieces = []
    for chunk in chunks:
        if isinstance(chunk, bytes):
            # byte chunks have to be decoded with the encoding that was
            # requested for the output; the default codec only works when
            # the two happen to agree
            if encoding:
                chunk = chunk.decode(encoding)
            else:
                chunk = chunk.decode()
        pieces.append(chunk)
    return "".join(pieces)
1122
1123
1124 ##
1125 # Generic element structure builder.  This builder converts a sequence
1126 # of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1127 # #TreeBuilder.end} method calls to a well-formed element structure.
1128 # <p>
1129 # You can use this class to build an element structure using a custom XML
1130 # parser, or a parser for some other XML-like format.
1131 #
1132 # @param element_factory Optional element factory.  This factory
1133 #    is called to create new Element instances, as necessary.
1134
1135
class TreeBuilder:
    def __init__(self, element_factory=None):
        self._data = []  # data collector
        self._elem = []  # element stack
        self._last = None  # last element
        self._tail = None  # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory

    ##
    # Flushes the parser buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # attach the collected character data to the last element: as
        # its tail when an end tag was seen last, as its text otherwise
        if self._data:
            if self._last is not None:
                try:
                    text = "".join(self._data)
                except TypeError:
                    # the chunks cannot be joined as they are (for
                    # example str mixed with bytes): decode every chunk
                    # that is not a str and join again
                    pieces = []
                    for item in self._data:
                        if not isinstance(item, str):
                            item = item.decode()
                        pieces.append(item)
                    text = "".join(pieces)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # nest the new element inside the innermost open element
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag, "end tag mismatch (expected %s, got %s)" % (
            self._last.tag,
            tag,
        )
        self._tail = 1
        return self._last
1217
1218
1219 ##
1220 # Element structure builder for XML source data, based on the
1221 # <b>expat</b> parser.
1222 #
1223 # @keyparam target Target object.  If omitted, the builder uses an
1224 #     instance of the standard {@link #TreeBuilder} class.
1225 # @keyparam html Predefine HTML entities.  This flag is not supported
1226 #     by the current implementation.
1227 # @see #ElementTree
1228 # @see #TreeBuilder
1229
1230
class XMLTreeBuilder:
    def __init__(self, html=0, target=None):
        # note: the html argument is accepted but not read here
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError("No module named expat; use SimpleXMLTreeBuilder instead")
        # "}" is passed as the namespace separator; _fixname later puts
        # a "{" in front of every name that contains it
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {}  # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None  # list of doctype tokens while inside a doctype
        self.entity = {}  # replacement text for otherwise undefined entities

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return _encode(text, "ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # start handler for attributes delivered as a dictionary
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start handler for attributes delivered as a flat
        # [name, value, name, value, ...] list
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i + 1])
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    def _default(self, text):
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self._target.data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat

                raise expat.error(
                    "undefined entity %s: line %d, column %d"
                    % (
                        text,
                        self._parser.ErrorLineNumber,
                        self._parser.ErrorColumnNumber,
                    )
                )
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = []  # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            # use the string method; the string.strip function is not
            # available on Python 3
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                # drop the first and last character (the quotes) of the
                # identifiers
                if pubid:
                    pubid = pubid[1:-1]
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        try:
            self._parser.Parse(data, 0)
        except Exception:
            # Best effort: report the chunk that could not be parsed and
            # carry on.  The chunk may be a str (nothing to decode) or
            # bytes that are not valid UTF-8 on their own, so the
            # diagnostic itself must not raise.
            # NOTE(review): the parse error is not re-raised, callers only
            # get this message -- confirm that this is intended.
            if isinstance(data, bytes):
                shown = data.decode("utf-8", "replace")
            else:
                shown = data
            print("*** problem feed:\n%s" % shown)

    ##
    # Finishes feeding data to the parser.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1)  # end of data
        tree = self._target.close()
        del self._target, self._parser  # get rid of circular references
        return tree