1 """Lightweight XML support for Python.
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree. This module has two classes for this purpose:
6 1. ElementTree represents the whole XML document as a tree and
8 2. Element represents a single node in this tree.
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level. Interactions with a single XML element
12 and its sub-elements are done on the Element level.
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary. Each Element has a number of properties associated with it:
18 'tag' - a string containing the element's name.
20 'attributes' - a Python dictionary storing the element's attributes.
22 'text' - a string containing the element's text content.
24 'tail' - an optional string containing text after the element's end tag.
26 And a number of child elements stored in a Python sequence.
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
36 #---------------------------------------------------------------------
37 # Licensed to PSF under a Contributor Agreement.
38 # See http://www.python.org/psf/license for licensing details.
41 # Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
43 # fredrik@pythonware.com
44 # http://www.pythonware.com
45 # --------------------------------------------------------------------
46 # The ElementTree toolkit is
48 # Copyright (c) 1999-2008 by Fredrik Lundh
50 # By obtaining, using, and/or copying this software and/or its
51 # associated documentation, you agree that you have read, understood,
52 # and will comply with the following terms and conditions:
54 # Permission to use, copy, modify, and distribute this software and
55 # its associated documentation for any purpose and without fee is
56 # hereby granted, provided that the above copyright notice appears in
57 # all copies, and that both that copyright notice and this permission
58 # notice appear in supporting documentation, and that the name of
59 # Secret Labs AB or the author not be used in advertising or publicity
60 # pertaining to distribution of the software without specific, written
63 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
71 # --------------------------------------------------------------------
77 "Element", "ElementTree",
78 "fromstring", "fromstringlist",
79 "iselement", "iterparse",
80 "parse", "ParseError",
81 "PI", "ProcessingInstruction",
84 "tostring", "tostringlist",
103 class ParseError(SyntaxError):
104 """An error when parsing an XML document.
106 In addition to its exception value, a ParseError contains
107 two extra attributes:
108 'code' - the specific exception code
109 'position' - the line and column of the error
114 # --------------------------------------------------------------------
def iselement(element):
    """Report whether *element* looks like an Element.

    The check is duck-typed: any object that exposes a ``tag`` attribute
    is accepted.
    """
    has_tag = hasattr(element, 'tag')
    return has_tag
125 This class is the reference implementation of the Element interface.
127 An element's length is its number of subelements. That means if you
128 want to check if an element is truly empty, you should check BOTH
129 its length AND its text attribute.
131 The element tag, attribute names, and attribute values can be either
134 *tag* is the element name. *attrib* is an optional dictionary containing
135 element attributes. *extra* are additional element attributes given as
139 <tag attrib>text<child/>...</tag>tail
144 """The element's name."""
147 """Dictionary of the element's attributes."""
151 Text before first subelement. This is either a string or the value None.
152 Note that if there is no text, this attribute may be either
153 None or the empty string, depending on the parser.
159 Text after this element's end tag, but before the next sibling element's
160 start tag. This is either a string or the value None. Note that if there
161 was no text, this attribute may be either None or an empty string,
162 depending on the parser.
166 def __init__(self, tag, attrib={}, **extra):
167 if not isinstance(attrib, dict):
168 raise TypeError("attrib must be dict, not %s" % (
169 attrib.__class__.__name__,))
170 attrib = attrib.copy()
177 return "<Element %s at 0x%x>" % (repr(self.tag), id(self))
def makeelement(self, tag, attrib):
    """Build a new element of the same class as this one.

    *tag* is the name of the new element and *attrib* is the dictionary
    of attributes handed to the class constructor.

    Do not call this method, use the SubElement factory function instead.
    """
    element_class = self.__class__
    return element_class(tag, attrib)
191 """Return copy of current element.
193 This creates a shallow copy. Subelements will be shared with the
197 elem = self.makeelement(self.tag, self.attrib)
198 elem.text = self.text
199 elem.tail = self.tail
204 return len(self._children)
208 "The behavior of this method will change in future versions. "
209 "Use specific 'len(elem)' or 'elem is not None' test instead.",
210 FutureWarning, stacklevel=2
212 return len(self._children) != 0 # emulate old behaviour, for now
214 def __getitem__(self, index):
215 return self._children[index]
217 def __setitem__(self, index, element):
218 # if isinstance(index, slice):
219 # for elt in element:
220 # assert iselement(elt)
222 # assert iselement(element)
223 self._children[index] = element
225 def __delitem__(self, index):
226 del self._children[index]
def append(self, subelement):
    """Attach *subelement* as the last child of this element.

    In document order the new child follows the last existing subelement
    (or comes directly after the text, if it is the first subelement) and
    precedes this element's end tag.

    *subelement* is first checked with _assert_is_element().
    """
    self._assert_is_element(subelement)
    children = self._children
    children.append(subelement)
def extend(self, elements):
    """Append subelements from an iterable.

    *elements* is an iterable with zero or more elements. Every item is
    checked with _assert_is_element() before any of them is added, so a
    failing check leaves the existing children untouched.
    """
    # Materialize first: a one-shot iterator (e.g. a generator) would be
    # exhausted by the validation loop, leaving nothing to append.
    pending = list(elements)
    for element in pending:
        self._assert_is_element(element)
    self._children.extend(pending)
def insert(self, index, subelement):
    """Place *subelement* among the children at position *index*.

    *subelement* is first checked with _assert_is_element().
    """
    self._assert_is_element(subelement)
    children = self._children
    children.insert(index, subelement)
def _assert_is_element(self, e):
    """Raise TypeError unless *e* is an instance of _Element_Py."""
    # The check targets _Element_Py, the actual Python implementation,
    # because the module-level name may be shadowed by the C implementation.
    if isinstance(e, _Element_Py):
        return
    raise TypeError('expected an Element, not %s' % type(e).__name__)
def remove(self, subelement):
    """Remove *subelement* from this element's children.

    Unlike the find methods, this method does not look at the tag value
    or contents: the child is located by list.remove() on the internal
    child list. To remove subelements by other criteria, build a list of
    the elements to keep and use slice assignment to update this element.

    ValueError is raised if a matching element could not be found.
    """
    children = self._children
    children.remove(subelement)
275 def getchildren(self):
276 """(Deprecated) Return all subelements.
278 Elements are returned in document order.
282 "This method will be removed in future versions. "
283 "Use 'list(elem)' or iteration over elem instead.",
284 DeprecationWarning, stacklevel=2
286 return self._children
def find(self, path, namespaces=None):
    """Locate the first subelement matching a tag name or path.

    *path* is a string holding either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    The lookup is delegated to ElementPath.find(); the first matching
    element is returned, or None if no element was found.
    """
    first_match = ElementPath.find(self, path, namespaces)
    return first_match
def findtext(self, path, default=None, namespaces=None):
    """Fetch the text of the first subelement matching a tag name or path.

    *path* is a string holding either an element tag or an XPath,
    *default* is the value to return if the element was not found,
    *namespaces* is an optional mapping from namespace prefix to full name.

    The lookup is delegated to ElementPath.findtext(). The result is the
    text content of the first matching element, or *default* if none was
    found. Note that if an element is found having no text content, the
    empty string is returned.
    """
    found_text = ElementPath.findtext(self, path, default, namespaces)
    return found_text
def findall(self, path, namespaces=None):
    """Collect every subelement matching a tag name or path.

    *path* is a string holding either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    The lookup is delegated to ElementPath.findall(), which returns a
    list containing all matching elements in document order.
    """
    matches = ElementPath.findall(self, path, namespaces)
    return matches
def iterfind(self, path, namespaces=None):
    """Iterate over every subelement matching a tag name or path.

    *path* is a string holding either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    The lookup is delegated to ElementPath.iterfind(), which returns an
    iterable yielding all matching elements in document order.
    """
    matches = ElementPath.iterfind(self, path, namespaces)
    return matches
338 This function removes all subelements, clears all attributes, and sets
339 the text and tail attributes to None.
344 self.text = self.tail = None
def get(self, key, default=None):
    """Look up an element attribute.

    Equivalent to attrib.get, but some implementations may handle this a
    bit more efficiently. *key* names the attribute to look for and
    *default* is returned when the attribute is not present.

    Returns the attribute value, or *default* if the attribute was not
    found.
    """
    attributes = self.attrib
    return attributes.get(key, default)
def set(self, key, value):
    """Assign an element attribute.

    Equivalent to attrib[key] = value, but some implementations may handle
    this a bit more efficiently. *key* names the attribute to set and
    *value* is the value stored for it.
    """
    attributes = self.attrib
    attributes[key] = value
370 """Get list of attribute names.
372 Names are returned in an arbitrary order, just like an ordinary
373 Python dict. Equivalent to attrib.keys()
376 return self.attrib.keys()
379 """Get element attributes as a sequence.
381 The attributes are returned in arbitrary order. Equivalent to
384 Return a list of (name, value) tuples.
387 return self.attrib.items()
389 def iter(self, tag=None):
390 """Create tree iterator.
392 The iterator loops over the element and all subelements in document
393 order, returning all elements with a matching tag.
395 If the tree structure is modified during iteration, new or removed
396 elements may or may not be included. To get a stable set, use the
397 list() function on the iterator, and loop over the resulting list.
399 *tag* is what tags to look for (default is to return all elements)
401 Return an iterator containing all the matching elements.
406 if tag is None or self.tag == tag:
408 for e in self._children:
409 yield from e.iter(tag)
412 def getiterator(self, tag=None):
413 # Change for a DeprecationWarning in 1.4
415 "This method will be removed in future versions. "
416 "Use 'elem.iter()' or 'list(elem.iter())' instead.",
417 PendingDeprecationWarning, stacklevel=2
419 return list(self.iter(tag))
422 """Create text iterator.
424 The iterator loops over the element and all subelements in document
425 order, returning all inner text.
429 if not isinstance(tag, str) and tag is not None:
434 yield from e.itertext()
439 def SubElement(parent, tag, attrib={}, **extra):
440 """Subelement factory which creates an element instance, and appends it
441 to an existing parent.
443 The element tag, attribute names, and attribute values can be either
444 bytes or Unicode strings.
446 *parent* is the parent element, *tag* is the subelements name, *attrib* is
447 an optional dictionary containing element attributes, *extra* are
448 additional attributes given as keyword arguments.
451 attrib = attrib.copy()
453 element = parent.makeelement(tag, attrib)
454 parent.append(element)
458 def Comment(text=None):
459 """Comment element factory.
461 This function creates a special element which the standard serializer
462 serializes as an XML comment.
464 *text* is a string containing the comment string.
467 element = Element(Comment)
472 def ProcessingInstruction(target, text=None):
473 """Processing Instruction element factory.
475 This function creates a special element which the standard serializer
476 serializes as an XML processing instruction.
478 *target* is a string containing the processing instruction, *text* is a
479 string containing the processing instruction contents, if any.
482 element = Element(ProcessingInstruction)
483 element.text = target
485 element.text = element.text + " " + text
488 PI = ProcessingInstruction
492 """Qualified name wrapper.
494 This class can be used to wrap a QName attribute value in order to get
495 proper namespace handling on output.
497 *text_or_uri* is a string containing the QName value either in the form
498 {uri}local, or if the tag argument is given, the URI part of a QName.
500 *tag* is an optional argument which if given, will make the first
501 argument (text_or_uri) be interpreted as a URI, and this argument (tag)
502 be interpreted as a local name.
505 def __init__(self, text_or_uri, tag=None):
507 text_or_uri = "{%s}%s" % (text_or_uri, tag)
508 self.text = text_or_uri
512 return '<QName %r>' % (self.text,)
514 return hash(self.text)
515 def __le__(self, other):
516 if isinstance(other, QName):
517 return self.text <= other.text
518 return self.text <= other
519 def __lt__(self, other):
520 if isinstance(other, QName):
521 return self.text < other.text
522 return self.text < other
523 def __ge__(self, other):
524 if isinstance(other, QName):
525 return self.text >= other.text
526 return self.text >= other
527 def __gt__(self, other):
528 if isinstance(other, QName):
529 return self.text > other.text
530 return self.text > other
531 def __eq__(self, other):
532 if isinstance(other, QName):
533 return self.text == other.text
534 return self.text == other
535 def __ne__(self, other):
536 if isinstance(other, QName):
537 return self.text != other.text
538 return self.text != other
540 # --------------------------------------------------------------------
544 """An XML element hierarchy.
546 This class also provides support for serialization to and from
549 *element* is an optional root element node,
550 *file* is an optional file handle or file name of an XML file whose
551 contents will be used to initialize the tree with.
554 def __init__(self, element=None, file=None):
555 # assert element is None or iselement(element)
556 self._root = element # first node
561 """Return root element of this tree."""
564 def _setroot(self, element):
565 """Replace root element of this tree.
567 This will discard the current contents of the tree and replace it
568 with the given element. Use with care!
571 # assert iselement(element)
574 def parse(self, source, parser=None):
575 """Load external XML document into element tree.
577 *source* is a file name or file object, *parser* is an optional parser
578 instance that defaults to XMLParser.
580 ParseError is raised if the parser fails to parse the document.
582 Returns the root element of the given source document.
586 if not hasattr(source, "read"):
587 source = open(source, "rb")
591 # If no parser was specified, create a default XMLParser
593 if hasattr(parser, '_parse_whole'):
594 # The default XMLParser, when it comes from an accelerator,
595 # can define an internal _parse_whole API for efficiency.
596 # It can be used to parse the whole source without feeding
598 self._root = parser._parse_whole(source)
601 data = source.read(65536)
605 self._root = parser.close()
def iter(self, tag=None):
    """Return the root element's tree iterator.

    The iterator loops over all elements in this tree, in document order.

    *tag* is a string with the tag name to iterate over
    (default is to return all elements).
    """
    # The root is used unconditionally; no check for a missing root is
    # made here (the corresponding assertion is disabled).
    root = self._root
    return root.iter(tag)
624 def getiterator(self, tag=None):
625 # Change for a DeprecationWarning in 1.4
627 "This method will be removed in future versions. "
628 "Use 'tree.iter()' or 'list(tree.iter())' instead.",
629 PendingDeprecationWarning, stacklevel=2
631 return list(self.iter(tag))
633 def find(self, path, namespaces=None):
634 """Find first matching element by tag name or path.
636 Same as getroot().find(path), which is Element.find()
638 *path* is a string having either an element tag or an XPath,
639 *namespaces* is an optional mapping from namespace prefix to full name.
641 Return the first matching element, or None if no element was found.
644 # assert self._root is not None
648 "This search is broken in 1.3 and earlier, and will be "
649 "fixed in a future version. If you rely on the current "
650 "behaviour, change it to %r" % path,
651 FutureWarning, stacklevel=2
653 return self._root.find(path, namespaces)
655 def findtext(self, path, default=None, namespaces=None):
656 """Find first matching element by tag name or path.
658 Same as getroot().findtext(path), which is Element.findtext()
660 *path* is a string having either an element tag or an XPath,
661 *namespaces* is an optional mapping from namespace prefix to full name.
663 Return the text content of the first matching element, or the default value if no element was found.
666 # assert self._root is not None
670 "This search is broken in 1.3 and earlier, and will be "
671 "fixed in a future version. If you rely on the current "
672 "behaviour, change it to %r" % path,
673 FutureWarning, stacklevel=2
675 return self._root.findtext(path, default, namespaces)
677 def findall(self, path, namespaces=None):
678 """Find all matching subelements by tag name or path.
680 Same as getroot().findall(path), which is Element.findall().
682 *path* is a string having either an element tag or an XPath,
683 *namespaces* is an optional mapping from namespace prefix to full name.
685 Return list containing all matching elements in document order.
688 # assert self._root is not None
692 "This search is broken in 1.3 and earlier, and will be "
693 "fixed in a future version. If you rely on the current "
694 "behaviour, change it to %r" % path,
695 FutureWarning, stacklevel=2
697 return self._root.findall(path, namespaces)
699 def iterfind(self, path, namespaces=None):
700 """Find all matching subelements by tag name or path.
702 Same as getroot().iterfind(path), which is element.iterfind()
704 *path* is a string having either an element tag or an XPath,
705 *namespaces* is an optional mapping from namespace prefix to full name.
707 Return an iterable yielding all matching elements in document order.
710 # assert self._root is not None
714 "This search is broken in 1.3 and earlier, and will be "
715 "fixed in a future version. If you rely on the current "
716 "behaviour, change it to %r" % path,
717 FutureWarning, stacklevel=2
719 return self._root.iterfind(path, namespaces)
721 def write(self, file_or_filename,
723 xml_declaration=None,
724 default_namespace=None,
726 short_empty_elements=True):
727 """Write element tree to a file as XML.
730 *file_or_filename* -- file name or a file object opened for writing
732 *encoding* -- the output encoding (default: US-ASCII)
734 *xml_declaration* -- bool indicating if an XML declaration should be
735 added to the output. If None, an XML declaration
736 is added if encoding IS NOT either of:
737 US-ASCII, UTF-8, or Unicode
739 *default_namespace* -- sets the default XML namespace (for "xmlns")
741 *method* -- either "xml" (default), "html", "text", or "c14n"
743 *short_empty_elements* -- controls the formatting of elements
744 that contain no content. If True (default)
745 they are emitted as a single self-closed
746 tag, otherwise they are emitted as a pair
752 elif method not in _serialize:
753 raise ValueError("unknown method %r" % method)
758 encoding = "us-ascii"
759 enc_lower = encoding.lower()
760 with _get_writer(file_or_filename, enc_lower) as write:
761 if method == "xml" and (xml_declaration or
762 (xml_declaration is None and
763 enc_lower not in ("utf-8", "us-ascii", "unicode"))):
764 declared_encoding = encoding
765 if enc_lower == "unicode":
766 # Retrieve the default encoding for the xml declaration
768 declared_encoding = locale.getpreferredencoding()
769 write("<?xml version='1.0' encoding='%s'?>\n" % (
772 _serialize_text(write, self._root)
774 qnames, namespaces = _namespaces(self._root, default_namespace)
775 serialize = _serialize[method]
776 serialize(write, self._root, qnames, namespaces,
777 short_empty_elements=short_empty_elements)
def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Provided for lxml.etree compatibility; calling write() with
    method="c14n" is the preferred spelling.
    """
    result = self.write(file, method="c14n")
    return result
783 # --------------------------------------------------------------------
784 # serialization support
786 @contextlib.contextmanager
787 def _get_writer(file_or_filename, encoding):
788 # returns text write method and release all resources after using
790 write = file_or_filename.write
791 except AttributeError:
792 # file_or_filename is a file name
793 if encoding == "unicode":
794 file = open(file_or_filename, "w")
796 file = open(file_or_filename, "w", encoding=encoding,
797 errors="xmlcharrefreplace")
801 # file_or_filename is a file-like object
802 # encoding determines if it is a text or binary writer
803 if encoding == "unicode":
804 # use a text writer as is
807 # wrap a binary writer with TextIOWrapper
808 with contextlib.ExitStack() as stack:
809 if isinstance(file_or_filename, io.BufferedIOBase):
810 file = file_or_filename
811 elif isinstance(file_or_filename, io.RawIOBase):
812 file = io.BufferedWriter(file_or_filename)
813 # Keep the original file open when the BufferedWriter is
815 stack.callback(file.detach)
817 # This is to handle passed objects that aren't in the
818 # IOBase hierarchy, but just have a write method
819 file = io.BufferedIOBase()
820 file.writable = lambda: True
823 # TextIOWrapper uses this methods to determine
824 # if BOM (for UTF-16, etc) should be added
825 file.seekable = file_or_filename.seekable
826 file.tell = file_or_filename.tell
827 except AttributeError:
829 file = io.TextIOWrapper(file,
831 errors="xmlcharrefreplace",
833 # Keep the original file open when the TextIOWrapper is
835 stack.callback(file.detach)
838 def _namespaces(elem, default_namespace=None):
839 # identify namespaces used in this tree
841 # maps qnames to *encoded* prefix:local names
842 qnames = {None: None}
844 # maps uri:s to prefixes
846 if default_namespace:
847 namespaces[default_namespace] = ""
849 def add_qname(qname):
850 # calculate serialized qname representation
853 uri, tag = qname[1:].rsplit("}", 1)
854 prefix = namespaces.get(uri)
856 prefix = _namespace_map.get(uri)
858 prefix = "ns%d" % len(namespaces)
860 namespaces[uri] = prefix
862 qnames[qname] = "%s:%s" % (prefix, tag)
864 qnames[qname] = tag # default element
866 if default_namespace:
867 # FIXME: can this be handled in XML 1.0?
869 "cannot use non-qualified names with "
870 "default_namespace option"
872 qnames[qname] = qname
874 _raise_serialization_error(qname)
876 # populate qname and namespaces table
877 for elem in elem.iter():
879 if isinstance(tag, QName):
880 if tag.text not in qnames:
882 elif isinstance(tag, str):
883 if tag not in qnames:
885 elif tag is not None and tag is not Comment and tag is not PI:
886 _raise_serialization_error(tag)
887 for key, value in elem.items():
888 if isinstance(key, QName):
890 if key not in qnames:
892 if isinstance(value, QName) and value.text not in qnames:
893 add_qname(value.text)
895 if isinstance(text, QName) and text.text not in qnames:
897 return qnames, namespaces
899 def _serialize_xml(write, elem, qnames, namespaces,
900 short_empty_elements, **kwargs):
904 write("<!--%s-->" % text)
905 elif tag is ProcessingInstruction:
906 write("<?%s?>" % text)
911 write(_escape_cdata(text))
913 _serialize_xml(write, e, qnames, None,
914 short_empty_elements=short_empty_elements)
917 items = list(elem.items())
918 if items or namespaces:
920 for v, k in sorted(namespaces.items(),
921 key=lambda x: x[1]): # sort on prefix
924 write(" xmlns%s=\"%s\"" % (
928 for k, v in sorted(items): # lexical order
929 if isinstance(k, QName):
931 if isinstance(v, QName):
934 v = _escape_attrib(v)
935 write(" %s=\"%s\"" % (qnames[k], v))
936 if text or len(elem) or not short_empty_elements:
939 write(_escape_cdata(text))
941 _serialize_xml(write, e, qnames, None,
942 short_empty_elements=short_empty_elements)
943 write("</" + tag + ">")
947 write(_escape_cdata(elem.tail))
949 # add from cvw jan 2019
950 def _serialize_pretty_xml(write, elem, qnames, namespaces,
951 short_empty_elements, indent=0):
952 # print("*****pretty***** indent", elem.tag, indent)
956 write("<!--%s-->" % text)
957 elif tag is ProcessingInstruction:
958 write("<?%s?>" % text)
963 write(_escape_cdata(text))
965 _serialize_pretty_xml(write, e, qnames, None,
966 short_empty_elements=short_empty_elements, indent=indent)
968 write(" "*indent + "<" + tag)
969 items = list(elem.items())
970 if items or namespaces:
972 for v, k in sorted(namespaces.items(),
973 key=lambda x: x[1]): # sort on prefix
976 write(" xmlns%s=\"%s\"" % (
980 for k, v in sorted(items): # lexical order
981 # print("atrrib ", k, v)
982 if isinstance(k, QName):
984 if isinstance(v, QName):
987 v = _escape_attrib(v)
988 write(" %s=\"%s\"" % (qnames[k], v))
989 if text or len(elem) or not short_empty_elements:
992 write(_escape_cdata(text))
997 _serialize_pretty_xml(write, e, qnames, None,
998 short_empty_elements=short_empty_elements, indent=indent+2)
999 write(" "*indent + "</" + tag + ">\n")
1003 write(_escape_cdata(elem.tail))
1006 HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
1007 "img", "input", "isindex", "link", "meta", "param")
1010 HTML_EMPTY = set(HTML_EMPTY)
1014 def _serialize_html(write, elem, qnames, namespaces, **kwargs):
1018 write("<!--%s-->" % _escape_cdata(text))
1019 elif tag is ProcessingInstruction:
1020 write("<?%s?>" % _escape_cdata(text))
1025 write(_escape_cdata(text))
1027 _serialize_html(write, e, qnames, None)
1030 items = list(elem.items())
1031 if items or namespaces:
1033 for v, k in sorted(namespaces.items(),
1034 key=lambda x: x[1]): # sort on prefix
1037 write(" xmlns%s=\"%s\"" % (
1041 for k, v in sorted(items): # lexical order
1042 if isinstance(k, QName):
1044 if isinstance(v, QName):
1047 v = _escape_attrib_html(v)
1048 # FIXME: handle boolean attributes
1049 write(" %s=\"%s\"" % (qnames[k], v))
1053 if ltag == "script" or ltag == "style":
1056 write(_escape_cdata(text))
1058 _serialize_html(write, e, qnames, None)
1059 if ltag not in HTML_EMPTY:
1060 write("</" + tag + ">")
1062 write(_escape_cdata(elem.tail))
1064 def _serialize_text(write, elem):
1065 for part in elem.itertext():
1071 "xml": _serialize_xml,
1072 "pretty_xml": _serialize_pretty_xml,
1073 "html": _serialize_html,
1074 "text": _serialize_text,
1075 # this optional method is imported at the end of the module
1076 # "c14n": _serialize_c14n,
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix has the reserved "ns<digits>" form.

    """
    # Raw string: "\d" inside a plain string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a snapshot because entries are deleted during the loop.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix
1100 # "well-known" namespace prefixes
1101 "http://www.w3.org/XML/1998/namespace": "xml",
1102 "http://www.w3.org/1999/xhtml": "html",
1103 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
1104 "http://schemas.xmlsoap.org/wsdl/": "wsdl",
1106 "http://www.w3.org/2001/XMLSchema": "xs",
1107 "http://www.w3.org/2001/XMLSchema-instance": "xsi",
1109 "http://purl.org/dc/elements/1.1/": "dc",
1111 # For tests and troubleshooting
1112 register_namespace._namespace_map = _namespace_map
1114 def _raise_serialization_error(text):
1116 "cannot serialize %r (type %s)" % (text, type(text).__name__)
1119 def _escape_cdata(text):
1120 # escape character data
1122 # it's worth avoiding do-nothing calls for strings that are
1123 # shorter than 500 character, or so. assume that's, by far,
1124 # the most common case in most applications.
1126 text = text.replace("&", "&")
1128 text = text.replace("<", "<")
1130 text = text.replace(">", ">")
1132 except (TypeError, AttributeError):
1133 _raise_serialization_error(text)
1135 def _escape_attrib(text):
1136 # escape attribute value
1139 text = text.replace("&", "&")
1141 text = text.replace("<", "<")
1143 text = text.replace(">", ">")
1145 text = text.replace("\"", """)
1147 text = text.replace("\n", " ")
1149 except (TypeError, AttributeError):
1150 _raise_serialization_error(text)
1152 def _escape_attrib_html(text):
1153 # escape attribute value
1156 text = text.replace("&", "&")
1158 text = text.replace(">", ">")
1160 text = text.replace("\"", """)
1162 except (TypeError, AttributeError):
1163 _raise_serialization_error(text)
1165 # --------------------------------------------------------------------
def tostring(element, encoding=None, method=None, *,
             short_empty_elements=True):
    """Serialize an XML element, including all subelements, to a string.

    If encoding is "unicode", a string is returned. Otherwise a
    bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding, *method* is an optional output method; both are passed on
    to ElementTree.write() together with *short_empty_elements*.

    Returns the accumulated contents of the in-memory stream the tree was
    written to.
    """
    # A text buffer is needed for "unicode"; every other encoding
    # produces bytes.
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    tree = ElementTree(element)
    tree.write(buffer, encoding, method=method,
               short_empty_elements=short_empty_elements)
    return buffer.getvalue()
1186 class _ListDataStream(io.BufferedIOBase):
1187 """An auxiliary stream accumulating into a list reference."""
1188 def __init__(self, lst):
1201 return len(self.lst)
1203 def tostringlist(element, encoding=None, method=None, *,
1204 short_empty_elements=True):
1206 stream = _ListDataStream(lst)
1207 ElementTree(element).write(stream, encoding, method=method,
1208 short_empty_elements=short_empty_elements)
1213 """Write element tree or element structure to sys.stdout.
1215 This function should be used for debugging only.
1217 *elem* is either an ElementTree, or a single Element. The exact output
1218 format is implementation dependent. In this version, it's written as an
1223 if not isinstance(elem, ElementTree):
1224 elem = ElementTree(elem)
1225 elem.write(sys.stdout, encoding="unicode")
1226 tail = elem.getroot().tail
1227 if not tail or tail[-1] != "\n":
1228 sys.stdout.write("\n")
1230 # --------------------------------------------------------------------
1234 def parse(source, parser=None):
1235 """Parse XML document into element tree.
1237 *source* is a filename or file object containing XML data,
1238 *parser* is an optional parser instance defaulting to XMLParser.
1240 Return an ElementTree instance.
1243 tree = ElementTree()
1244 tree.parse(source, parser)
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is called with. The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information). If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.

    """
    close_source = False
    if not hasattr(source, "read"):
        # A filename was given: open it here and remember that this
        # function owns (and must eventually close) the file.
        source = open(source, "rb")
        close_source = True
    try:
        return _IterParseIterator(source, events, parser, close_source)
    except BaseException:
        # Building the iterator failed (e.g. an unknown event name); do not
        # leak a file that was opened above.
        if close_source:
            source.close()
        raise
class XMLPullParser:
    """Non-blocking parser: feed it data, then pull the resulting events.

    *events* is a sequence of event names to report; when omitted only
    "end" events are reported.
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        # _elementtree.c expects a list, not a deque
        self._events_queue = []
        # Position of the next unread entry in the queue.
        self._index = 0
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        if events is None:
            events = ("end",)
        self._parser._setevents(self._events_queue, events)

    def feed(self, data):
        """Feed encoded data to parser."""
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if data:
            try:
                self._parser.feed(data)
            except SyntaxError as exc:
                # Defer the error: it is raised from read_events() once the
                # events queued before it have been consumed.
                self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.
        """
        events = self._events_queue
        while True:
            index = self._index
            try:
                event = events[self._index]
                # Avoid retaining references to past events
                events[self._index] = None
            except IndexError:
                break
            index += 1
            # Compact the list in a O(1) amortized fashion
            # As noted above, _elementtree.c needs a list, not a deque
            if index * 2 >= len(events):
                events[:index] = []
                self._index = 0
            else:
                self._index = index
            if isinstance(event, Exception):
                raise event
            else:
                yield event
1344 class _IterParseIterator:
1346 def __init__(self, source, events, parser, close_source=False):
1347 # Use the internal, undocumented _parser argument for now; When the
1348 # parser argument of iterparse is removed, this can be killed.
1349 self._parser = XMLPullParser(events=events, _parser=parser)
1351 self._close_file = close_source
1352 self.root = self._root = None
1357 for event in self._parser.read_events():
1359 if self._parser._parser is None:
1362 data = self._file.read(16 * 1024)
1364 self._parser.feed(data)
1366 self._root = self._parser._close_and_return_root()
1367 self.root = self._root
1369 if self._close_file:
1372 if self._close_file:
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    return parser.close()
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    tree = parser.close()
    ids = {}
    for elem in tree.iter():
        elem_id = elem.get("id")
        # Elements without an "id" attribute (or with an empty one) are
        # left out of the mapping.
        if elem_id:
            ids[elem_id] = elem
    return tree, ids
# Parse XML document from string constant. Alias for XML().
fromstring = XML
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of string fragments, *parser* is
    an optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    for text in sequence:
        parser.feed(text)
    return parser.close()
1436 # --------------------------------------------------------------------
class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    """
    def __init__(self, element_factory=None):
        self._data = []  # data collector
        self._elem = []  # element stack
        self._last = None  # last element
        self._tail = None  # true if we're after an end tag
        if element_factory is None:
            element_factory = Element
        self._factory = element_factory

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # Attach the collected character data to the most recent element:
        # as its tail when an end tag was seen last, otherwise as its text.
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = None
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last
1513 # also see ElementTree and TreeBuilder
class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *html* are predefined HTML entities (not supported currently),
    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    """

    def __init__(self, html=0, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" as namespace separator makes expat report qualified names as
        # "uri}local"; _fixname() adds the leading "{".
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {}  # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        self._doctype = None
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass  # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor.
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            # Each handler binds its collaborators as default arguments so
            # that it keeps the values of this loop iteration.
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                def handler(prefix, uri, event=event_name, append=append):
                    append((event, (prefix or "", uri or "")))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                def handler(prefix, event=event_name, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, carrying over its error
        # code and position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            for i in range(0, len(attr_list), 2):
                attrib[fixname(attr_list[i])] = attr_list[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                    self.parser.ErrorColumnNumber)
                    )
                err.code = 11  # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = []  # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                    if pubid:
                        pubid = pubid[1:-1]
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif self.doctype != self._XMLParser__doctype:
                    # warn about deprecated call
                    self._XMLParser__doctype(name, pubid, system[1:-1])
                    self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    def doctype(self, name, pubid, system):
        """(Deprecated) Handle doctype declaration

        *name* is the Doctype name, *pubid* is the public identifier,
        and *system* is the system identifier.

        """
        # Imported locally so the method does not depend on a module-level
        # import being present.
        import warnings
        warnings.warn(
            "This method of XMLParser is deprecated.  Define doctype() "
            "method on the TreeBuilder target.",
            DeprecationWarning,
            )

    # sentinel, if doctype is redefined in a subclass
    __doctype = doctype

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure."""
        try:
            self.parser.Parse("", 1)  # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target
1725 # Import the C accelerators
1727 # Element is going to be shadowed by the C implementation. We need to keep
1728 # the Python version of it accessible for some "creative" use by external code
1730 _Element_Py = Element
1732 # Element, SubElement, ParseError, TreeBuilder, XMLParser
1733 from _elementtree import *