src/ElementTreePython3.py

   1 """Lightweight XML support for Python.
   2
   3  XML is an inherently hierarchical data format, and the most natural way to
   4  represent it is with a tree.  This module has two classes for this purpose:
   5
   6     1. ElementTree represents the whole XML document as a tree and
   7
   8     2. Element represents a single node in this tree.
   9
  10  Interactions with the whole document (reading and writing to/from files) are
  11  usually done on the ElementTree level.  Interactions with a single XML element
  12  and its sub-elements are done on the Element level.
  13
  14  Element is a flexible container object designed to store hierarchical data
  15  structures in memory. It can be described as a cross between a list and a
  16  dictionary.  Each Element has a number of properties associated with it:
  17
  18     'tag' - a string containing the element's name.
  19
  20     'attrib' - a Python dictionary storing the element's attributes.
  21
  22     'text' - a string containing the element's text content.
  23
  24     'tail' - an optional string containing text after the element's end tag.
  25
  26     And a number of child elements stored in a Python sequence.
  27
  28  To create an element instance, use the Element constructor,
  29  or the SubElement factory function.
  30
  31  You can also use the ElementTree class to wrap an element structure
  32  and convert it to and from XML.
  33
  34 """
  35
  36 # ---------------------------------------------------------------------
  37 # Licensed to PSF under a Contributor Agreement.
  38 # See http://www.python.org/psf/license for licensing details.
  39 #
  40 # ElementTree
  41 # Copyright (c) 1999-2008 by Fredrik Lundh.  All rights reserved.
  42 #
  43 # fredrik@pythonware.com
  44 # http://www.pythonware.com
  45 # --------------------------------------------------------------------
  46 # The ElementTree toolkit is
  47 #
  48 # Copyright (c) 1999-2008 by Fredrik Lundh
  49 #
  50 # By obtaining, using, and/or copying this software and/or its
  51 # associated documentation, you agree that you have read, understood,
  52 # and will comply with the following terms and conditions:
  53 #
  54 # Permission to use, copy, modify, and distribute this software and
  55 # its associated documentation for any purpose and without fee is
  56 # hereby granted, provided that the above copyright notice appears in
  57 # all copies, and that both that copyright notice and this permission
  58 # notice appear in supporting documentation, and that the name of
  59 # Secret Labs AB or the author not be used in advertising or publicity
  60 # pertaining to distribution of the software without specific, written
  61 # prior permission.
  62 #
  63 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
  64 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
  65 # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
  66 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
  67 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  68 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  69 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  70 # OF THIS SOFTWARE.
  71 # --------------------------------------------------------------------
  72
# Names exported by ``from <module> import *``; everything else defined in
# this file is considered private.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "register_namespace",
]

# Version string; part of the public API (it is listed in __all__ above).
VERSION = "1.3.0"
 100
 101 import sys
 102 import re
 103 import warnings
 104 import io
 105 import contextlib
 106
 107 import ElementPath
 108
 109
 110 class ParseError(SyntaxError):
 111     """An error when parsing an XML document.
 112
 113     In addition to its exception value, a ParseError contains
 114     two extra attributes:
 115         'code'     - the specific exception code
 116         'position' - the line and column of the error
 117
 118     """
 119
 120     pass
 121
 122
 123 # --------------------------------------------------------------------
 124
 125
 126 def iselement(element):
 127     """Return True if *element* appears to be an Element."""
 128     return hasattr(element, "tag")
 129
 130
 131 class Element:
 132     """An XML element.
 133
 134     This class is the reference implementation of the Element interface.
 135
 136     An element's length is its number of subelements.  That means if you
 137     want to check if an element is truly empty, you should check BOTH
 138     its length AND its text attribute.
 139
 140     The element tag, attribute names, and attribute values can be either
 141     bytes or strings.
 142
 143     *tag* is the element name.  *attrib* is an optional dictionary containing
 144     element attributes. *extra* are additional element attributes given as
 145     keyword arguments.
 146
 147     Example form:
 148         <tag attrib>text<child/>...</tag>tail
 149
 150     """
 151
 152     tag = None
 153     """The element's name."""
 154
 155     attrib = None
 156     """Dictionary of the element's attributes."""
 157
 158     text = None
 159     """
 160     Text before first subelement. This is either a string or the value None.
 161     Note that if there is no text, this attribute may be either
 162     None or the empty string, depending on the parser.
 163
 164     """
 165
 166     tail = None
 167     """
 168     Text after this element's end tag, but before the next sibling element's
 169     start tag.  This is either a string or the value None.  Note that if there
 170     was no text, this attribute may be either None or an empty string,
 171     depending on the parser.
 172
 173     """
 174
 175     def __init__(self, tag, attrib={}, **extra):
 176         if not isinstance(attrib, dict):
 177             raise TypeError(
 178                 "attrib must be dict, not %s" % (attrib.__class__.__name__,)
 179             )
 180         attrib = attrib.copy()
 181         attrib.update(extra)
 182         self.tag = tag
 183         self.attrib = attrib
 184         self._children = []
 185
 186     def __repr__(self):
 187         return "<Element %s at 0x%x>" % (repr(self.tag), id(self))
 188
 189     def makeelement(self, tag, attrib):
 190         """Create a new element with the same type.
 191
 192         *tag* is a string containing the element name.
 193         *attrib* is a dictionary containing the element attributes.
 194
 195         Do not call this method, use the SubElement factory function instead.
 196
 197         """
 198         return self.__class__(tag, attrib)
 199
 200     def copy(self):
 201         """Return copy of current element.
 202
 203         This creates a shallow copy. Subelements will be shared with the
 204         original tree.
 205
 206         """
 207         elem = self.makeelement(self.tag, self.attrib)
 208         elem.text = self.text
 209         elem.tail = self.tail
 210         elem[:] = self
 211         return elem
 212
 213     def __len__(self):
 214         return len(self._children)
 215
 216     def __bool__(self):
 217         warnings.warn(
 218             "The behavior of this method will change in future versions.  "
 219             "Use specific 'len(elem)' or 'elem is not None' test instead.",
 220             FutureWarning,
 221             stacklevel=2,
 222         )
 223         return len(self._children) != 0  # emulate old behaviour, for now
 224
 225     def __getitem__(self, index):
 226         return self._children[index]
 227
 228     def __setitem__(self, index, element):
 229         # if isinstance(index, slice):
 230         #     for elt in element:
 231         #         assert iselement(elt)
 232         # else:
 233         #     assert iselement(element)
 234         self._children[index] = element
 235
 236     def __delitem__(self, index):
 237         del self._children[index]
 238
 239     def append(self, subelement):
 240         """Add *subelement* to the end of this element.
 241
 242         The new element will appear in document order after the last existing
 243         subelement (or directly after the text, if it's the first subelement),
 244         but before the end tag for this element.
 245
 246         """
 247         self._assert_is_element(subelement)
 248         self._children.append(subelement)
 249
 250     def extend(self, elements):
 251         """Append subelements from a sequence.
 252
 253         *elements* is a sequence with zero or more elements.
 254
 255         """
 256         for element in elements:
 257             self._assert_is_element(element)
 258         self._children.extend(elements)
 259
 260     def insert(self, index, subelement):
 261         """Insert *subelement* at position *index*."""
 262         self._assert_is_element(subelement)
 263         self._children.insert(index, subelement)
 264
 265     def _assert_is_element(self, e):
 266         # Need to refer to the actual Python implementation, not the
 267         # shadowing C implementation.
 268         if not isinstance(e, _Element_Py):
 269             raise TypeError("expected an Element, not %s" % type(e).__name__)
 270
 271     def remove(self, subelement):
 272         """Remove matching subelement.
 273
 274         Unlike the find methods, this method compares elements based on
 275         identity, NOT ON tag value or contents.  To remove subelements by
 276         other means, the easiest way is to use a list comprehension to
 277         select what elements to keep, and then use slice assignment to update
 278         the parent element.
 279
 280         ValueError is raised if a matching element could not be found.
 281
 282         """
 283         # assert iselement(element)
 284         self._children.remove(subelement)
 285
 286     def getchildren(self):
 287         """(Deprecated) Return all subelements.
 288
 289         Elements are returned in document order.
 290
 291         """
 292         warnings.warn(
 293             "This method will be removed in future versions.  "
 294             "Use 'list(elem)' or iteration over elem instead.",
 295             DeprecationWarning,
 296             stacklevel=2,
 297         )
 298         return self._children
 299
 300     def find(self, path, namespaces=None):
 301         """Find first matching element by tag name or path.
 302
 303         *path* is a string having either an element tag or an XPath,
 304         *namespaces* is an optional mapping from namespace prefix to full name.
 305
 306         Return the first matching element, or None if no element was found.
 307
 308         """
 309         return ElementPath.find(self, path, namespaces)
 310
 311     def findtext(self, path, default=None, namespaces=None):
 312         """Find text for first matching element by tag name or path.
 313
 314         *path* is a string having either an element tag or an XPath,
 315         *default* is the value to return if the element was not found,
 316         *namespaces* is an optional mapping from namespace prefix to full name.
 317
 318         Return text content of first matching element, or default value if
 319         none was found.  Note that if an element is found having no text
 320         content, the empty string is returned.
 321
 322         """
 323         return ElementPath.findtext(self, path, default, namespaces)
 324
 325     def findall(self, path, namespaces=None):
 326         """Find all matching subelements by tag name or path.
 327
 328         *path* is a string having either an element tag or an XPath,
 329         *namespaces* is an optional mapping from namespace prefix to full name.
 330
 331         Returns list containing all matching elements in document order.
 332
 333         """
 334         return ElementPath.findall(self, path, namespaces)
 335
 336     def iterfind(self, path, namespaces=None):
 337         """Find all matching subelements by tag name or path.
 338
 339         *path* is a string having either an element tag or an XPath,
 340         *namespaces* is an optional mapping from namespace prefix to full name.
 341
 342         Return an iterable yielding all matching elements in document order.
 343
 344         """
 345         return ElementPath.iterfind(self, path, namespaces)
 346
 347     def clear(self):
 348         """Reset element.
 349
 350         This function removes all subelements, clears all attributes, and sets
 351         the text and tail attributes to None.
 352
 353         """
 354         self.attrib.clear()
 355         self._children = []
 356         self.text = self.tail = None
 357
 358     def get(self, key, default=None):
 359         """Get element attribute.
 360
 361         Equivalent to attrib.get, but some implementations may handle this a
 362         bit more efficiently.  *key* is what attribute to look for, and
 363         *default* is what to return if the attribute was not found.
 364
 365         Returns a string containing the attribute value, or the default if
 366         attribute was not found.
 367
 368         """
 369         return self.attrib.get(key, default)
 370
 371     def set(self, key, value):
 372         """Set element attribute.
 373
 374         Equivalent to attrib[key] = value, but some implementations may handle
 375         this a bit more efficiently.  *key* is what attribute to set, and
 376         *value* is the attribute value to set it to.
 377
 378         """
 379         self.attrib[key] = value
 380
 381     def keys(self):
 382         """Get list of attribute names.
 383
 384         Names are returned in an arbitrary order, just like an ordinary
 385         Python dict.  Equivalent to attrib.keys()
 386
 387         """
 388         return self.attrib.keys()
 389
 390     def items(self):
 391         """Get element attributes as a sequence.
 392
 393         The attributes are returned in arbitrary order.  Equivalent to
 394         attrib.items().
 395
 396         Return a list of (name, value) tuples.
 397
 398         """
 399         return self.attrib.items()
 400
 401     def iter(self, tag=None):
 402         """Create tree iterator.
 403
 404         The iterator loops over the element and all subelements in document
 405         order, returning all elements with a matching tag.
 406
 407         If the tree structure is modified during iteration, new or removed
 408         elements may or may not be included.  To get a stable set, use the
 409         list() function on the iterator, and loop over the resulting list.
 410
 411         *tag* is what tags to look for (default is to return all elements)
 412
 413         Return an iterator containing all the matching elements.
 414
 415         """
 416         if tag == "*":
 417             tag = None
 418         if tag is None or self.tag == tag:
 419             yield self
 420         for e in self._children:
 421             yield from e.iter(tag)
 422
 423     # compatibility
 424     def getiterator(self, tag=None):
 425         # Change for a DeprecationWarning in 1.4
 426         warnings.warn(
 427             "This method will be removed in future versions.  "
 428             "Use 'elem.iter()' or 'list(elem.iter())' instead.",
 429             PendingDeprecationWarning,
 430             stacklevel=2,
 431         )
 432         return list(self.iter(tag))
 433
 434     def itertext(self):
 435         """Create text iterator.
 436
 437         The iterator loops over the element and all subelements in document
 438         order, returning all inner text.
 439
 440         """
 441         tag = self.tag
 442         if not isinstance(tag, str) and tag is not None:
 443             return
 444         if self.text:
 445             yield self.text
 446         for e in self:
 447             yield from e.itertext()
 448             if e.tail:
 449                 yield e.tail
 450
 451
 452 def SubElement(parent, tag, attrib={}, **extra):
 453     """Subelement factory which creates an element instance, and appends it
 454     to an existing parent.
 455
 456     The element tag, attribute names, and attribute values can be either
 457     bytes or Unicode strings.
 458
 459     *parent* is the parent element, *tag* is the subelements name, *attrib* is
 460     an optional directory containing element attributes, *extra* are
 461     additional attributes given as keyword arguments.
 462
 463     """
 464     attrib = attrib.copy()
 465     attrib.update(extra)
 466     element = parent.makeelement(tag, attrib)
 467     parent.append(element)
 468     return element
 469
 470
 471 def Comment(text=None):
 472     """Comment element factory.
 473
 474     This function creates a special element which the standard serializer
 475     serializes as an XML comment.
 476
 477     *text* is a string containing the comment string.
 478
 479     """
 480     element = Element(Comment)
 481     element.text = text
 482     return element
 483
 484
 485 def ProcessingInstruction(target, text=None):
 486     """Processing Instruction element factory.
 487
 488     This function creates a special element which the standard serializer
 489     serializes as an XML comment.
 490
 491     *target* is a string containing the processing instruction, *text* is a
 492     string containing the processing instruction contents, if any.
 493
 494     """
 495     element = Element(ProcessingInstruction)
 496     element.text = target
 497     if text:
 498         element.text = element.text + " " + text
 499     return element
 500
 501
 502 PI = ProcessingInstruction
 503
 504
 505 class QName:
 506     """Qualified name wrapper.
 507
 508     This class can be used to wrap a QName attribute value in order to get
 509     proper namespace handing on output.
 510
 511     *text_or_uri* is a string containing the QName value either in the form
 512     {uri}local, or if the tag argument is given, the URI part of a QName.
 513
 514     *tag* is an optional argument which if given, will make the first
 515     argument (text_or_uri) be interpreted as a URI, and this argument (tag)
 516     be interpreted as a local name.
 517
 518     """
 519
 520     def __init__(self, text_or_uri, tag=None):
 521         if tag:
 522             text_or_uri = "{%s}%s" % (text_or_uri, tag)
 523         self.text = text_or_uri
 524
 525     def __str__(self):
 526         return self.text
 527
 528     def __repr__(self):
 529         return "<QName %r>" % (self.text,)
 530
 531     def __hash__(self):
 532         return hash(self.text)
 533
 534     def __le__(self, other):
 535         if isinstance(other, QName):
 536             return self.text <= other.text
 537         return self.text <= other
 538
 539     def __lt__(self, other):
 540         if isinstance(other, QName):
 541             return self.text < other.text
 542         return self.text < other
 543
 544     def __ge__(self, other):
 545         if isinstance(other, QName):
 546             return self.text >= other.text
 547         return self.text >= other
 548
 549     def __gt__(self, other):
 550         if isinstance(other, QName):
 551             return self.text > other.text
 552         return self.text > other
 553
 554     def __eq__(self, other):
 555         if isinstance(other, QName):
 556             return self.text == other.text
 557         return self.text == other
 558
 559     def __ne__(self, other):
 560         if isinstance(other, QName):
 561             return self.text != other.text
 562         return self.text != other
 563
 564
 565 # --------------------------------------------------------------------
 566
 567
 568 class ElementTree:
 569     """An XML element hierarchy.
 570
 571     This class also provides support for serialization to and from
 572     standard XML.
 573
 574     *element* is an optional root element node,
 575     *file* is an optional file handle or file name of an XML file whose
 576     contents will be used to initialize the tree with.
 577
 578     """
 579
 580     def __init__(self, element=None, file=None):
 581         # assert element is None or iselement(element)
 582         self._root = element  # first node
 583         if file:
 584             self.parse(file)
 585
 586     def getroot(self):
 587         """Return root element of this tree."""
 588         return self._root
 589
 590     def _setroot(self, element):
 591         """Replace root element of this tree.
 592
 593         This will discard the current contents of the tree and replace it
 594         with the given element.  Use with care!
 595
 596         """
 597         # assert iselement(element)
 598         self._root = element
 599
 600     def parse(self, source, parser=None):
 601         """Load external XML document into element tree.
 602
 603         *source* is a file name or file object, *parser* is an optional parser
 604         instance that defaults to XMLParser.
 605
 606         ParseError is raised if the parser fails to parse the document.
 607
 608         Returns the root element of the given source document.
 609
 610         """
 611         close_source = False
 612         if not hasattr(source, "read"):
 613             source = open(source, "rb")
 614             close_source = True
 615         try:
 616             if parser is None:
 617                 # If no parser was specified, create a default XMLParser
 618                 parser = XMLParser()
 619                 if hasattr(parser, "_parse_whole"):
 620                     # The default XMLParser, when it comes from an accelerator,
 621                     # can define an internal _parse_whole API for efficiency.
 622                     # It can be used to parse the whole source without feeding
 623                     # it with chunks.
 624                     self._root = parser._parse_whole(source)
 625                     return self._root
 626             while True:
 627                 data = source.read(65536)
 628                 if not data:
 629                     break
 630                 parser.feed(data)
 631             self._root = parser.close()
 632             return self._root
 633         finally:
 634             if close_source:
 635                 source.close()
 636
 637     def iter(self, tag=None):
 638         """Create and return tree iterator for the root element.
 639
 640         The iterator loops over all elements in this tree, in document order.
 641
 642         *tag* is a string with the tag name to iterate over
 643         (default is to return all elements).
 644
 645         """
 646         # assert self._root is not None
 647         return self._root.iter(tag)
 648
 649     # compatibility
 650     def getiterator(self, tag=None):
 651         # Change for a DeprecationWarning in 1.4
 652         warnings.warn(
 653             "This method will be removed in future versions.  "
 654             "Use 'tree.iter()' or 'list(tree.iter())' instead.",
 655             PendingDeprecationWarning,
 656             stacklevel=2,
 657         )
 658         return list(self.iter(tag))
 659
 660     def find(self, path, namespaces=None):
 661         """Find first matching element by tag name or path.
 662
 663         Same as getroot().find(path), which is Element.find()
 664
 665         *path* is a string having either an element tag or an XPath,
 666         *namespaces* is an optional mapping from namespace prefix to full name.
 667
 668         Return the first matching element, or None if no element was found.
 669
 670         """
 671         # assert self._root is not None
 672         if path[:1] == "/":
 673             path = "." + path
 674             warnings.warn(
 675                 "This search is broken in 1.3 and earlier, and will be "
 676                 "fixed in a future version.  If you rely on the current "
 677                 "behaviour, change it to %r" % path,
 678                 FutureWarning,
 679                 stacklevel=2,
 680             )
 681         return self._root.find(path, namespaces)
 682
 683     def findtext(self, path, default=None, namespaces=None):
 684         """Find first matching element by tag name or path.
 685
 686         Same as getroot().findtext(path),  which is Element.findtext()
 687
 688         *path* is a string having either an element tag or an XPath,
 689         *namespaces* is an optional mapping from namespace prefix to full name.
 690
 691         Return the first matching element, or None if no element was found.
 692
 693         """
 694         # assert self._root is not None
 695         if path[:1] == "/":
 696             path = "." + path
 697             warnings.warn(
 698                 "This search is broken in 1.3 and earlier, and will be "
 699                 "fixed in a future version.  If you rely on the current "
 700                 "behaviour, change it to %r" % path,
 701                 FutureWarning,
 702                 stacklevel=2,
 703             )
 704         return self._root.findtext(path, default, namespaces)
 705
 706     def findall(self, path, namespaces=None):
 707         """Find all matching subelements by tag name or path.
 708
 709         Same as getroot().findall(path), which is Element.findall().
 710
 711         *path* is a string having either an element tag or an XPath,
 712         *namespaces* is an optional mapping from namespace prefix to full name.
 713
 714         Return list containing all matching elements in document order.
 715
 716         """
 717         # assert self._root is not None
 718         if path[:1] == "/":
 719             path = "." + path
 720             warnings.warn(
 721                 "This search is broken in 1.3 and earlier, and will be "
 722                 "fixed in a future version.  If you rely on the current "
 723                 "behaviour, change it to %r" % path,
 724                 FutureWarning,
 725                 stacklevel=2,
 726             )
 727         return self._root.findall(path, namespaces)
 728
 729     def iterfind(self, path, namespaces=None):
 730         """Find all matching subelements by tag name or path.
 731
 732         Same as getroot().iterfind(path), which is element.iterfind()
 733
 734         *path* is a string having either an element tag or an XPath,
 735         *namespaces* is an optional mapping from namespace prefix to full name.
 736
 737         Return an iterable yielding all matching elements in document order.
 738
 739         """
 740         # assert self._root is not None
 741         if path[:1] == "/":
 742             path = "." + path
 743             warnings.warn(
 744                 "This search is broken in 1.3 and earlier, and will be "
 745                 "fixed in a future version.  If you rely on the current "
 746                 "behaviour, change it to %r" % path,
 747                 FutureWarning,
 748                 stacklevel=2,
 749             )
 750         return self._root.iterfind(path, namespaces)
 751
 752     def write(
 753         self,
 754         file_or_filename,
 755         encoding=None,
 756         xml_declaration=None,
 757         default_namespace=None,
 758         method=None,
 759         *,
 760         short_empty_elements=True
 761     ):
 762         """Write element tree to a file as XML.
 763
 764         Arguments:
 765           *file_or_filename* -- file name or a file object opened for writing
 766
 767           *encoding* -- the output encoding (default: US-ASCII)
 768
 769           *xml_declaration* -- bool indicating if an XML declaration should be
 770                                added to the output. If None, an XML declaration
 771                                is added if encoding IS NOT either of:
 772                                US-ASCII, UTF-8, or Unicode
 773
 774           *default_namespace* -- sets the default XML namespace (for "xmlns")
 775
 776           *method* -- either "xml" (default), "html, "text", or "c14n"
 777
 778           *short_empty_elements* -- controls the formatting of elements
 779                                     that contain no content. If True (default)
 780                                     they are emitted as a single self-closed
 781                                     tag, otherwise they are emitted as a pair
 782                                     of start/end tags
 783
 784         """
 785         if not method:
 786             method = "xml"
 787         elif method not in _serialize:
 788             raise ValueError("unknown method %r" % method)
 789         if not encoding:
 790             if method == "c14n":
 791                 encoding = "utf-8"
 792             else:
 793                 encoding = "us-ascii"
 794         enc_lower = encoding.lower()
 795         with _get_writer(file_or_filename, enc_lower) as write:
 796             if method == "xml" and (
 797                 xml_declaration
 798                 or (
 799                     xml_declaration is None
 800                     and enc_lower not in ("utf-8", "us-ascii", "unicode")
 801                 )
 802             ):
 803                 declared_encoding = encoding
 804                 if enc_lower == "unicode":
 805                     # Retrieve the default encoding for the xml declaration
 806                     import locale
 807
 808                     declared_encoding = locale.getpreferredencoding()
 809                 write("<?xml version='1.0' encoding='%s'?>\n" % (declared_encoding,))
 810             if method == "text":
 811                 _serialize_text(write, self._root)
 812             else:
 813                 qnames, namespaces = _namespaces(self._root, default_namespace)
 814                 serialize = _serialize[method]
 815                 serialize(
 816                     write,
 817                     self._root,
 818                     qnames,
 819                     namespaces,
 820                     short_empty_elements=short_empty_elements,
 821                 )
 822
 823     def write_c14n(self, file):
 824         # lxml.etree compatibility.  use output method instead
 825         return self.write(file, method="c14n")
 826
 827
 828 # --------------------------------------------------------------------
 829 # serialization support
 830
 831
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Yield a text ``write`` callable for *file_or_filename*.

    *file_or_filename* is either an object with a ``write`` attribute or a
    value that open() accepts as a file name.  *encoding* is the output
    encoding; the special value "unicode" means the text is written as is,
    without wrapping the target in an encoder.

    A file opened here is closed when the context exits.  A file object
    supplied by the caller is left open: the wrappers built around it are
    detached rather than closed.

    """
    # returns text write method and release all resources after using
    try:
        write = file_or_filename.write
    except AttributeError:
        # file_or_filename is a file name
        if encoding == "unicode":
            # no encoding is passed, so open() uses its default text encoding
            file = open(file_or_filename, "w")
        else:
            # characters the encoding cannot represent are written as
            # numeric character references
            file = open(
                file_or_filename, "w", encoding=encoding, errors="xmlcharrefreplace"
            )
        # the file was opened here, so it is closed once the caller is done
        with file:
            yield file.write
    else:
        # file_or_filename is a file-like object
        # encoding determines if it is a text or binary writer
        if encoding == "unicode":
            # use a text writer as is
            yield write
        else:
            # wrap a binary writer with TextIOWrapper
            with contextlib.ExitStack() as stack:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    file = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    file = io.BufferedWriter(file_or_filename)
                    # Keep the original file open when the BufferedWriter is
                    # destroyed
                    stack.callback(file.detach)
                else:
                    # This is to handle passed objects that aren't in the
                    # IOBase hierarchy, but just have a write method
                    file = io.BufferedIOBase()
                    file.writable = lambda: True
                    file.write = write
                    try:
                        # TextIOWrapper uses these methods to determine
                        # if BOM (for UTF-16, etc) should be added
                        file.seekable = file_or_filename.seekable
                        file.tell = file_or_filename.tell
                    except AttributeError:
                        pass
                file = io.TextIOWrapper(
                    file, encoding=encoding, errors="xmlcharrefreplace", newline="\n"
                )
                # Keep the original file open when the TextIOWrapper is
                # destroyed
                stack.callback(file.detach)
                yield file.write
 883
 884
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespace URIs used below *elem*.

    Walks *elem* and all of its descendants and returns a
    ``(qnames, namespaces)`` pair.  *qnames* maps every tag, attribute name
    and QName value that was found to its serialized form (``prefix:local``,
    or the bare name).  *namespaces* maps each namespace URI to the prefix
    chosen for it.  When *default_namespace* is given, that URI is mapped to
    the empty prefix.

    ValueError is raised for a non-qualified name when *default_namespace*
    is set.  Tags and names that cannot be handled are reported through
    _raise_serialization_error().

    """
    # identify namespaces used in this tree

    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                # "{uri}local" form: split on the last "}"
                uri, tag = qname[1:].rsplit("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    # not seen in this tree yet: try the global registry,
                    # then fall back to a generated "ns<N>" prefix
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    # the "xml" prefix is used but never recorded in the
                    # table of namespaces for this tree
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, tag)
                else:
                    qnames[qname] = tag  # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                    )
                qnames[qname] = qname
        except TypeError:
            # qname does not support the string operations used above
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    for elem in elem.iter():
        tag = elem.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            # None, Comment and PI tags need no qname; anything else that is
            # neither a str nor a QName is rejected
            _raise_serialization_error(tag)
        for key, value in elem.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            # attribute values are only registered when they are QNames
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = elem.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
 945
 946
 947 def _serialize_xml(write, elem, qnames, namespaces, short_empty_elements, **kwargs):
 948     tag = elem.tag
 949     text = elem.text
 950     if tag is Comment:
 951         write("<!--%s-->" % text)
 952     elif tag is ProcessingInstruction:
 953         write("<?%s?>" % text)
 954     else:
 955         tag = qnames[tag]
 956         if tag is None:
 957             if text:
 958                 write(_escape_cdata(text))
 959             for e in elem:
 960                 _serialize_xml(
 961                     write, e, qnames, None, short_empty_elements=short_empty_elements
 962                 )
 963         else:
 964             write("<" + tag)
 965             items = list(elem.items())
 966             if items or namespaces:
 967                 if namespaces:
 968                     for v, k in sorted(
 969                         namespaces.items(), key=lambda x: x[1]
 970                     ):  # sort on prefix
 971                         if k:
 972                             k = ":" + k
 973                         write(' xmlns%s="%s"' % (k, _escape_attrib(v)))
 974                 for k, v in sorted(items):  # lexical order
 975                     if isinstance(k, QName):
 976                         k = k.text
 977                     if isinstance(v, QName):
 978                         v = qnames[v.text]
 979                     else:
 980                         v = _escape_attrib(v)
 981                     write(' %s="%s"' % (qnames[k], v))
 982             if text or len(elem) or not short_empty_elements:
 983                 write(">")
 984                 if text:
 985                     write(_escape_cdata(text))
 986                 for e in elem:
 987                     _serialize_xml(
 988                         write,
 989                         e,
 990                         qnames,
 991                         None,
 992                         short_empty_elements=short_empty_elements,
 993                     )
 994                 write("</" + tag + ">")
 995             else:
 996                 write(" />")
 997     if elem.tail:
 998         write(_escape_cdata(elem.tail))
 999
1000
1001 # add from cvw jan 2019
def _serialize_pretty_xml(
    write, elem, qnames, namespaces, short_empty_elements, indent=0, **kwargs
):
    """Serialize *elem* and its subtree as indented XML through *write*.

    Works like the plain XML serializer, but every start tag is preceded
    by *indent* spaces, children are indented two further spaces, and
    every end tag (or ``<tag />``) is followed by a newline.  Extra keyword
    arguments are accepted and ignored, as in the other serializers.

    The end tag is only put on its own indented line when the element has
    children.  For an element without children it directly follows the
    text, so that no indentation whitespace is appended to the text
    content.  Elements that have both text and children (mixed content)
    still get indentation inserted between the text and the first child.
    Comments and processing instructions are written without indentation
    or newline.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        tag = qnames[tag]
        if tag is None:
            # no tag of its own: write the content at the current level
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_pretty_xml(
                    write,
                    e,
                    qnames,
                    None,
                    short_empty_elements=short_empty_elements,
                    indent=indent,
                )
        else:
            write(" " * indent + "<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(
                        namespaces.items(), key=lambda x: x[1]
                    ):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(' xmlns%s="%s"' % (k, _escape_attrib(v)))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v)
                    write(' %s="%s"' % (qnames[k], v))
            has_children = len(elem) > 0
            if text or has_children or not short_empty_elements:
                if text:
                    write(">")
                    write(_escape_cdata(text))
                elif has_children:
                    # children start on their own lines
                    write(">\n")
                else:
                    # empty element in long form: "<tag></tag>"
                    write(">")

                for e in elem:
                    _serialize_pretty_xml(
                        write,
                        e,
                        qnames,
                        None,
                        short_empty_elements=short_empty_elements,
                        indent=indent + 2,
                    )
                if has_children:
                    # the last child ended its line; align with the start tag
                    write(" " * indent + "</" + tag + ">\n")
                else:
                    # keep the end tag right after the text so the text
                    # content is not padded with indentation
                    write("</" + tag + ">\n")
            else:
                write(" />\n")
    if elem.tail:
        write(_escape_cdata(elem.tail))
1067
1068
# Lower-case names of the HTML elements for which the HTML serializer
# writes no end tag.  A set, so that membership tests are cheap.
HTML_EMPTY = {
    "area",
    "base",
    "basefont",
    "br",
    "col",
    "frame",
    "hr",
    "img",
    "input",
    "isindex",
    "link",
    "meta",
    "param",
}
1089
1090
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Serialize *elem* and its subtree as HTML through *write*.

    *qnames* maps tags and attribute names to their serialized form and
    *namespaces* maps namespace URIs to prefixes.  The namespace
    declarations are written on this element only.  The text of ``script``
    and ``style`` elements is written unescaped, and elements listed in
    HTML_EMPTY get no end tag.  The tail of *elem*, if any, is written
    after the element itself.

    """
    raw_tag = elem.tag
    content = elem.text
    if raw_tag is Comment:
        write("<!--%s-->" % _escape_cdata(content))
    elif raw_tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(content))
    else:
        name = qnames[raw_tag]
        verbatim = False  # True for script/style, whose text is not escaped
        closing = None  # end tag still to be written, if any
        if name is not None:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                # declarations are sorted on prefix
                declarations = sorted(namespaces.items(), key=lambda pair: pair[1])
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(' xmlns%s="%s"' % (prefix, _escape_attrib(uri)))
            for key, value in sorted(attributes):  # lexical order
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    rendered = qnames[value.text]
                else:
                    rendered = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(' %s="%s"' % (qnames[key], rendered))
            write(">")
            lowered = name.lower()
            verbatim = lowered == "script" or lowered == "style"
            if lowered not in HTML_EMPTY:
                closing = "</" + name + ">"
        # An element whose qname is None has no tags of its own; only its
        # text and children are written.
        if content:
            if verbatim:
                write(content)
            else:
                write(_escape_cdata(content))
        for child in elem:
            _serialize_html(write, child, qnames, None)
        if closing is not None:
            write(closing)
    if elem.tail:
        write(_escape_cdata(elem.tail))
1138
1139
1140 def _serialize_text(write, elem):
1141     for part in elem.itertext():
1142         write(part)
1143     if elem.tail:
1144         write(elem.tail)
1145
1146
# Dispatch table from output method name to the function implementing it.
_serialize = {
    "xml": _serialize_xml,
    "pretty_xml": _serialize_pretty_xml,
    "html": _serialize_html,
    "text": _serialize_text,
    # this optional method is imported at the end of the module
    #   "c14n": _serialize_c14n,
}
1155
1156
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix is reserved or is invalid.

    """
    # Prefixes of the form "ns<digits>" are generated by the serializer.
    # The pattern is a raw string: "\d" in a plain literal is an invalid
    # escape sequence.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a copy, since entries are deleted while scanning.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix
1175
1176
# Global registry of preferred prefixes, keyed by namespace URI.  It is
# consulted when a URI has no prefix yet and is updated by
# register_namespace().
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting
register_namespace._namespace_map = _namespace_map
1191
1192
1193 def _raise_serialization_error(text):
1194     raise TypeError("cannot serialize %r (type %s)" % (text, type(text).__name__))
1195
1196
1197 def _escape_cdata(text):
1198     # escape character data
1199     try:
1200         # it's worth avoiding do-nothing calls for strings that are
1201         # shorter than 500 character, or so.  assume that's, by far,
1202         # the most common case in most applications.
1203         if "&" in text:
1204             text = text.replace("&", "&amp;")
1205         if "<" in text:
1206             text = text.replace("<", "&lt;")
1207         if ">" in text:
1208             text = text.replace(">", "&gt;")
1209         return text
1210     except (TypeError, AttributeError):
1211         _raise_serialization_error(text)
1212
1213
1214 def _escape_attrib(text):
1215     # escape attribute value
1216     try:
1217         if "&" in text:
1218             text = text.replace("&", "&amp;")
1219         if "<" in text:
1220             text = text.replace("<", "&lt;")
1221         if ">" in text:
1222             text = text.replace(">", "&gt;")
1223         if '"' in text:
1224             text = text.replace('"', "&quot;")
1225         if "\n" in text:
1226             text = text.replace("\n", "&#10;")
1227         return text
1228     except (TypeError, AttributeError):
1229         _raise_serialization_error(text)
1230
1231
1232 def _escape_attrib_html(text):
1233     # escape attribute value
1234     try:
1235         if "&" in text:
1236             text = text.replace("&", "&amp;")
1237         if ">" in text:
1238             text = text.replace(">", "&gt;")
1239         if '"' in text:
1240             text = text.replace('"', "&quot;")
1241         return text
1242     except (TypeError, AttributeError):
1243         _raise_serialization_error(text)
1244
1245
1246 # --------------------------------------------------------------------
1247
1248
def tostring(element, encoding=None, method=None, *, short_empty_elements=True):
    """Serialize *element* and all its subelements into a single value.

    *element* is wrapped in an ElementTree and written to an in-memory
    stream.  *encoding*, *method* and *short_empty_elements* are passed on
    to ElementTree.write() unchanged.

    Returns a str when *encoding* is "unicode", and a bytestring for any
    other value.

    """
    if encoding == "unicode":
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    tree = ElementTree(element)
    tree.write(
        buffer, encoding, method=method, short_empty_elements=short_empty_elements
    )
    return buffer.getvalue()
1267
1268
1269 class _ListDataStream(io.BufferedIOBase):
1270     """An auxiliary stream accumulating into a list reference."""
1271
1272     def __init__(self, lst):
1273         self.lst = lst
1274
1275     def writable(self):
1276         return True
1277
1278     def seekable(self):
1279         return True
1280
1281     def write(self, b):
1282         self.lst.append(b)
1283
1284     def tell(self):
1285         return len(self.lst)
1286
1287
def tostringlist(element, encoding=None, method=None, *, short_empty_elements=True):
    """Serialize *element* and return the output as a list of chunks.

    *element* is wrapped in an ElementTree and written to a stream that
    collects every chunk it receives into a list.  *encoding*, *method*
    and *short_empty_elements* are passed on to ElementTree.write()
    unchanged.

    """
    fragments = []
    sink = _ListDataStream(fragments)
    tree = ElementTree(element)
    tree.write(
        sink, encoding, method=method, short_empty_elements=short_empty_elements
    )
    return fragments
1295
1296
def dump(elem):
    """Write an element tree or element structure to sys.stdout.

    Intended for debugging only.

    *elem* is either an ElementTree or a single Element; a single Element
    is wrapped in an ElementTree first.  The tree is written as text, and
    a newline is added unless the tail of the root element already ends
    with one.

    """
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    ends_with_newline = tail and tail[-1] == "\n"
    if not ends_with_newline:
        sys.stdout.write("\n")
1314
1315
1316 # --------------------------------------------------------------------
1317 # parsing
1318
1319
def parse(source, parser=None):
    """Parse an XML document and return it as an ElementTree.

    *source* is a filename or file object containing XML data and
    *parser* is an optional parser instance; both are handed to
    ElementTree.parse() on a newly created, empty tree.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1332
1333
def iterparse(source, events=None, parser=None):
    """Incrementally parse an XML document, reporting parse events.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back and *parser* is an optional parser
    instance.  The supported events are the strings "start", "end",
    "start-ns" and "end-ns" (the "ns" events give detailed namespace
    information).  If *events* is omitted, only "end" events are reported.

    Returns an iterator providing (event, elem) pairs.

    """
    # Anything without a read() method is treated as a file name.  A file
    # opened here is owned by the iterator, which closes it when done.
    owns_source = not hasattr(source, "read")
    if owns_source:
        source = open(source, "rb")
    try:
        return _IterParseIterator(source, events, parser, owns_source)
    except BaseException:
        # the iterator could not be created, so nobody else will close the
        # file opened above
        if owns_source:
            source.close()
        raise
1359
1360
class XMLPullParser:
    """Parser that is fed data in chunks and queues the resulting events.

    Data is pushed in with feed(), and the (event, elem) pairs produced so
    far are pulled out with read_events().  *events* is a sequence of the
    event names to report; it defaults to ("end",).

    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        # _elementtree.c expects a list, not a deque
        self._events_queue = []
        # position in the queue of the next event to hand out
        self._index = 0
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        if events is None:
            events = ("end",)
        self._parser._setevents(self._events_queue, events)

    def feed(self, data):
        """Feed encoded data to parser."""
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if data:
            try:
                self._parser.feed(data)
            except SyntaxError as exc:
                # the error is queued instead of raised, so that
                # read_events() raises it after the events queued before it
                self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        # a parser of None marks the end of the stream
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.
        """
        events = self._events_queue
        while True:
            index = self._index
            try:
                event = events[self._index]
                # Avoid retaining references to past events
                events[self._index] = None
            except IndexError:
                # nothing left in the queue for now
                break
            index += 1
            # Compact the list in a O(1) amortized fashion
            # As noted above, _elementtree.c needs a list, not a deque
            if index * 2 >= len(events):
                # at least half of the list was consumed: drop that part in
                # place, so the list object itself stays the same
                events[:index] = []
                self._index = 0
            else:
                self._index = index
            # the position is updated before the event is handed out or the
            # queued error is raised
            if isinstance(event, Exception):
                raise event
            else:
                yield event
1427
1428
class _IterParseIterator:
    """Iterator returned by iterparse().

    Reads *source* in chunks, feeds them to an XMLPullParser and hands out
    the queued events one at a time.  Once the input is exhausted, the
    ``root`` attribute is set to the value returned when the parser was
    closed.  When *close_source* is true, *source* is closed at the end of
    the iteration or when an error occurs.

    """

    def __init__(self, source, events, parser, close_source=False):
        # Use the internal, undocumented _parser argument for now; When the
        # parser argument of iterparse is removed, this can be killed.
        self._parser = XMLPullParser(events=events, _parser=parser)
        self._file = source
        self._close_file = close_source
        self._root = None
        self.root = None

    def __next__(self):
        pull_parser = self._parser
        try:
            while True:
                # hand out at most one queued event per call
                for event in pull_parser.read_events():
                    return event
                if pull_parser._parser is None:
                    # the parser was closed and its queue is drained
                    break
                # load event buffer
                chunk = self._file.read(16 * 1024)
                if not chunk:
                    self._root = pull_parser._close_and_return_root()
                else:
                    pull_parser.feed(chunk)
            self.root = self._root
        except BaseException:
            self._release_source()
            raise
        self._release_source()
        raise StopIteration

    def _release_source(self):
        # close the source, but only when this iterator is responsible for it
        if self._close_file:
            self._file.close()

    def __iter__(self):
        return self
1462
1463
def XML(text, parser=None):
    """Parse an XML document held in a string and return its root.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data.  *parser* is an optional
    parser instance; without one, an XMLParser building into a TreeBuilder
    is created.

    Returns whatever the parser's close() returns, which is an Element
    instance for the standard parser.

    """
    active_parser = parser or XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    return active_parser.close()
1479
1480
def XMLID(text, parser=None):
    """Parse an XML document held in a string and index it by id.

    *text* is a string containing XML data.  *parser* is an optional
    parser instance; without one, an XMLParser building into a TreeBuilder
    is created.

    Returns an (Element, dict) tuple, in which the dict maps the value of
    each non-empty "id" attribute to the element carrying it.  When an id
    occurs more than once, the last element in document order wins.

    """
    active_parser = parser or XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    root = active_parser.close()
    by_id = {}
    for node in root.iter():
        ident = node.get("id")
        if ident:
            by_id[ident] = node
    return root, by_id
1501
1502
# fromstring is an alias for XML(): it parses an XML document from a string
# constant and takes the same arguments.
fromstring = XML
1505
1506
def fromstringlist(sequence, parser=None):
    """Parse an XML document given as a sequence of string fragments.

    *sequence* is a list or other iterable of fragments, which are fed to
    the parser one after the other.  *parser* is an optional parser
    instance; without one, an XMLParser building into a TreeBuilder is
    created.

    Returns whatever the parser's close() returns, which is an Element
    instance for the standard parser.

    """
    active_parser = parser or XMLParser(target=TreeBuilder())
    for fragment in sequence:
        active_parser.feed(fragment)
    return active_parser.close()
1521
1522
1523 # --------------------------------------------------------------------
1524
1525
class TreeBuilder:
    """Generic element structure builder.

    Turns a sequence of start(), data() and end() calls into an element
    structure.  It can be driven by a custom XML parser, or by a parser
    for some other XML-like format.

    *element_factory* is an optional callable taking ``(tag, attrs)``,
    used to create every new element; it defaults to Element.

    """

    def __init__(self, element_factory=None):
        self._factory = Element if element_factory is None else element_factory
        self._data = []  # text chunks collected since the last flush
        self._elem = []  # stack of the elements that are still open
        self._last = None  # element most recently opened or closed
        self._tail = None  # true if we're after an end tag

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        still_open = len(self._elem)
        assert still_open == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # Attach the collected text to the last element: as its tail when
        # that element was already closed, as its text otherwise.
        if not self._data:
            return
        target = self._last
        if target is not None:
            joined = "".join(self._data)
            if self._tail:
                assert target.tail is None, "internal error (tail)"
                target.tail = joined
            else:
                assert target.text is None, "internal error (text)"
                target.text = joined
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        node = self._factory(tag, attrs)
        self._last = node
        open_elements = self._elem
        if open_elements:
            # the innermost open element becomes the parent
            open_elements[-1].append(node)
        open_elements.append(node)
        self._tail = 0
        return node

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag, "end tag mismatch (expected %s, got %s)" % (
            closed.tag,
            tag,
        )
        self._tail = 1
        return closed
1600
1601
1602 # also see ElementTree and TreeBuilder
1603 class XMLParser:
1604     """Element structure builder for XML source data based on the expat parser.
1605
1606     *html* are predefined HTML entities (not supported currently),
1607     *target* is an optional target object which defaults to an instance of the
1608     standard TreeBuilder class, *encoding* is an optional encoding string
1609     which if given, overrides the encoding specified in the XML file:
1610     http://www.iana.org/assignments/character-sets
1611
1612     """
1613
    def __init__(self, html=0, target=None, encoding=None):
        """Create the expat parser and connect it to *target*.

        *target* defaults to a new TreeBuilder.  Handlers are only
        installed for the methods that *target* actually provides (start,
        end, data, comment, pi).  *encoding* is passed on to expat.  *html*
        is not used in this method.

        ImportError is raised when no expat module can be imported.

        """
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # "}" is the separator expat puts between a namespace URI and the
        # local name; _fixname() later adds the leading "{"
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {}  # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, "start"):
            parser.StartElementHandler = self._start
        if hasattr(target, "end"):
            parser.EndElementHandler = self._end
        if hasattr(target, "data"):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, "comment"):
            parser.CommentHandler = target.comment
        if hasattr(target, "pi"):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        # None while outside a doctype declaration; a list collecting its
        # pieces while inside one
        self._doctype = None
        # replacement text for entities, looked up by name when expat
        # reports an undefined entity
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass  # unknown
1655
    def _setevents(self, events_queue, events_to_report):
        """Install expat handlers that append parse events to a queue.

        For every name in *events_to_report*, the matching expat handler is
        replaced by one that appends an ``(event, value)`` pair to
        *events_queue*.  ValueError is raised for an unknown event name.

        """
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor.
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        parser = self._parser
        append = events_queue.append
        # Each handler receives the values it needs as default arguments, so
        # they are bound when the handler is defined.
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1

                def handler(
                    tag, attrib_in, event=event_name, append=append, start=self._start
                ):
                    # queue the value returned by the target for the start tag
                    append((event, start(tag, attrib_in)))

                parser.StartElementHandler = handler
            elif event_name == "end":

                def handler(tag, event=event_name, append=append, end=self._end):
                    # queue the value returned by the target for the end tag
                    append((event, end(tag)))

                parser.EndElementHandler = handler
            elif event_name == "start-ns":

                def handler(prefix, uri, event=event_name, append=append):
                    # a missing prefix or uri is reported as an empty string
                    append((event, (prefix or "", uri or "")))

                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":

                def handler(prefix, event=event_name, append=append):
                    # the prefix going out of scope is not reported
                    append((event, None))

                parser.EndNamespaceDeclHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)
1696
    def _raiseerror(self, value):
        """Raise a ParseError built from the error object *value*.

        The ``code`` of *value* is copied to the new error, and its
        ``lineno`` and ``offset`` are stored as the ``position`` tuple.

        """
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err
1702
1703     def _fixname(self, key):
1704         # expand qname, and convert name string to ascii, if possible
1705         try:
1706             name = self._names[key]
1707         except KeyError:
1708             name = key
1709             if "}" in name:
1710                 name = "{" + name
1711             self._names[key] = name
1712         return name
1713
1714     def _start(self, tag, attr_list):
1715         # Handler for expat's StartElementHandler. Since ordered_attributes
1716         # is set, the attributes are reported as a list of alternating
1717         # attribute name,value.
1718         fixname = self._fixname
1719         tag = fixname(tag)
1720         attrib = {}
1721         if attr_list:
1722             for i in range(0, len(attr_list), 2):
1723                 attrib[fixname(attr_list[i])] = attr_list[i + 1]
1724         return self.target.start(tag, attrib)
1725
1726     def _end(self, tag):
1727         return self.target.end(self._fixname(tag))
1728
1729     def _default(self, text):
1730         prefix = text[:1]
1731         if prefix == "&":
1732             # deal with undefined entities
1733             try:
1734                 data_handler = self.target.data
1735             except AttributeError:
1736                 return
1737             try:
1738                 data_handler(self.entity[text[1:-1]])
1739             except KeyError:
1740                 from xml.parsers import expat
1741
1742                 err = expat.error(
1743                     "undefined entity %s: line %d, column %d"
1744                     % (text, self.parser.ErrorLineNumber, self.parser.ErrorColumnNumber)
1745                 )
1746                 err.code = 11  # XML_ERROR_UNDEFINED_ENTITY
1747                 err.lineno = self.parser.ErrorLineNumber
1748                 err.offset = self.parser.ErrorColumnNumber
1749                 raise err
1750         elif prefix == "<" and text[:9] == "<!DOCTYPE":
1751             self._doctype = []  # inside a doctype declaration
1752         elif self._doctype is not None:
1753             # parse doctype contents
1754             if prefix == ">":
1755                 self._doctype = None
1756                 return
1757             text = text.strip()
1758             if not text:
1759                 return
1760             self._doctype.append(text)
1761             n = len(self._doctype)
1762             if n > 2:
1763                 type = self._doctype[1]
1764                 if type == "PUBLIC" and n == 4:
1765                     name, type, pubid, system = self._doctype
1766                     if pubid:
1767                         pubid = pubid[1:-1]
1768                 elif type == "SYSTEM" and n == 3:
1769                     name, type, system = self._doctype
1770                     pubid = None
1771                 else:
1772                     return
1773                 if hasattr(self.target, "doctype"):
1774                     self.target.doctype(name, pubid, system[1:-1])
1775                 elif self.doctype != self._XMLParser__doctype:
1776                     # warn about deprecated call
1777                     self._XMLParser__doctype(name, pubid, system[1:-1])
1778                     self.doctype(name, pubid, system[1:-1])
1779                 self._doctype = None
1780
1781     def doctype(self, name, pubid, system):
1782         """(Deprecated)  Handle doctype declaration
1783
1784         *name* is the Doctype name, *pubid* is the public identifier,
1785         and *system* is the system identifier.
1786
1787         """
1788         warnings.warn(
1789             "This method of XMLParser is deprecated.  Define doctype() "
1790             "method on the TreeBuilder target.",
1791             DeprecationWarning,
1792         )
1793
    # sentinel, if doctype is redefined in a subclass: _default() compares
    # self.doctype against this private alias to detect an override
    __doctype = doctype
1796
1797     def feed(self, data):
1798         """Feed encoded data to parser."""
1799         try:
1800             self.parser.Parse(data, 0)
1801         except self._error as v:
1802             self._raiseerror(v)
1803
1804     def close(self):
1805         """Finish feeding data to parser and return element structure."""
1806         try:
1807             self.parser.Parse("", 1)  # end of data
1808         except self._error as v:
1809             self._raiseerror(v)
1810         try:
1811             close_handler = self.target.close
1812         except AttributeError:
1813             pass
1814         else:
1815             return close_handler()
1816         finally:
1817             # get rid of circular references
1818             del self.parser, self._parser
1819             del self.target, self._target
1820
1821
# Import the C accelerators
try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests)
    _Element_Py = Element

    # Element, SubElement, ParseError, TreeBuilder, XMLParser
    from _elementtree import *
except ImportError:
    # C extension not importable: the pure-Python definitions stay in place.
    pass