1 """Lightweight XML support for Python.
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree. This module has two classes for this purpose:
6 1. ElementTree represents the whole XML document as a tree and
8 2. Element represents a single node in this tree.
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level. Interactions with a single XML element
12 and its sub-elements are done on the Element level.
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary. Each Element has a number of properties associated with it:
18 'tag' - a string containing the element's name.
20 'attributes' - a Python dictionary storing the element's attributes.
22 'text' - a string containing the element's text content.
24 'tail' - an optional string containing text after the element's end tag.
26 And a number of child elements stored in a Python sequence.
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
36 # ---------------------------------------------------------------------
37 # Licensed to PSF under a Contributor Agreement.
38 # See http://www.python.org/psf/license for licensing details.
41 # Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
43 # fredrik@pythonware.com
44 # http://www.pythonware.com
45 # --------------------------------------------------------------------
46 # The ElementTree toolkit is
48 # Copyright (c) 1999-2008 by Fredrik Lundh
50 # By obtaining, using, and/or copying this software and/or its
51 # associated documentation, you agree that you have read, understood,
52 # and will comply with the following terms and conditions:
54 # Permission to use, copy, modify, and distribute this software and
55 # its associated documentation for any purpose and without fee is
56 # hereby granted, provided that the above copyright notice appears in
57 # all copies, and that both that copyright notice and this permission
58 # notice appear in supporting documentation, and that the name of
59 # Secret Labs AB or the author not be used in advertising or publicity
60 # pertaining to distribution of the software without specific, written
63 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
71 # --------------------------------------------------------------------
86 "ProcessingInstruction",
110 class ParseError(SyntaxError):
111 """An error when parsing an XML document.
113 In addition to its exception value, a ParseError contains
114 two extra attributes:
115 'code' - the specific exception code
116 'position' - the line and column of the error
123 # --------------------------------------------------------------------
def iselement(element):
    """Return True if *element* appears to be an Element.

    The check is duck-typed: any object exposing a ``tag`` attribute
    qualifies.
    """
    missing = object()
    return getattr(element, "tag", missing) is not missing
134 This class is the reference implementation of the Element interface.
136 An element's length is its number of subelements. That means if you
137 want to check if an element is truly empty, you should check BOTH
138 its length AND its text attribute.
140 The element tag, attribute names, and attribute values can be either
143 *tag* is the element name. *attrib* is an optional dictionary containing
144 element attributes. *extra* are additional element attributes given as
148 <tag attrib>text<child/>...</tag>tail
153 """The element's name."""
156 """Dictionary of the element's attributes."""
160 Text before first subelement. This is either a string or the value None.
161 Note that if there is no text, this attribute may be either
162 None or the empty string, depending on the parser.
168 Text after this element's end tag, but before the next sibling element's
169 start tag. This is either a string or the value None. Note that if there
170 was no text, this attribute may be either None or an empty string,
171 depending on the parser.
175 def __init__(self, tag, attrib={}, **extra):
176 if not isinstance(attrib, dict):
178 "attrib must be dict, not %s" % (attrib.__class__.__name__,)
180 attrib = attrib.copy()
187 return "<Element %s at 0x%x>" % (repr(self.tag), id(self))
def makeelement(self, tag, attrib):
    """Create a new element of the same type as this one.

    *tag* is a string containing the element name.
    *attrib* is a dictionary containing the element attributes.

    Do not call this method, use the SubElement factory function instead.

    """
    # Look the class up on the instance so a subclass builds its own type.
    factory = self.__class__
    return factory(tag, attrib)
201 """Return copy of current element.
203 This creates a shallow copy. Subelements will be shared with the
207 elem = self.makeelement(self.tag, self.attrib)
208 elem.text = self.text
209 elem.tail = self.tail
214 return len(self._children)
218 "The behavior of this method will change in future versions. "
219 "Use specific 'len(elem)' or 'elem is not None' test instead.",
223 return len(self._children) != 0 # emulate old behaviour, for now
225 def __getitem__(self, index):
226 return self._children[index]
228 def __setitem__(self, index, element):
229 # if isinstance(index, slice):
230 # for elt in element:
231 # assert iselement(elt)
233 # assert iselement(element)
234 self._children[index] = element
236 def __delitem__(self, index):
237 del self._children[index]
def append(self, subelement):
    """Add *subelement* to the end of this element.

    The new element is placed after the last existing subelement, i.e. at
    the end of the child list.  TypeError is raised by
    _assert_is_element if *subelement* fails its check, in which case
    nothing is added.

    """
    self._assert_is_element(subelement)
    children = self._children
    children.append(subelement)
def extend(self, elements):
    """Append subelements from an iterable.

    *elements* is an iterable with zero or more elements.  Every item is
    checked with _assert_is_element before anything is added, so a failed
    check (TypeError) leaves the existing children unchanged.

    """
    # Validate into a private list: a one-shot iterator (e.g. a generator)
    # would otherwise be exhausted by the check loop, and the subsequent
    # extend would silently add nothing.
    checked = []
    for element in elements:
        self._assert_is_element(element)
        checked.append(element)
    self._children.extend(checked)
def insert(self, index, subelement):
    """Insert *subelement* at position *index*.

    The element is checked with _assert_is_element first; nothing is
    inserted if that check raises.
    """
    self._assert_is_element(subelement)
    children = self._children
    children.insert(index, subelement)
def _assert_is_element(self, e):
    """Raise TypeError unless *e* is an instance of _Element_Py."""
    # Need to refer to the actual Python implementation, not the
    # shadowing C implementation.
    if isinstance(e, _Element_Py):
        return
    raise TypeError("expected an Element, not %s" % type(e).__name__)
def remove(self, subelement):
    """Remove matching subelement.

    Unlike the find methods, matching is not done on tag value or
    contents: the child list's own remove() is used, which drops the
    first child comparing equal to *subelement* (identity, for objects
    that do not define their own equality).  To remove subelements by
    other means, select the elements to keep with a list comprehension
    and assign the result back with slice assignment.

    ValueError is raised if a matching element could not be found.

    """
    # assert iselement(element)
    siblings = self._children
    siblings.remove(subelement)
286 def getchildren(self):
287 """(Deprecated) Return all subelements.
289 Elements are returned in document order.
293 "This method will be removed in future versions. "
294 "Use 'list(elem)' or iteration over elem instead.",
298 return self._children
def find(self, path, namespaces=None):
    """Find first matching element by tag name or path.

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    The search is delegated to ElementPath.find, starting at this element.
    Return the first matching element, or None if no element was found.

    """
    search = ElementPath.find
    return search(self, path, namespaces)
def findtext(self, path, default=None, namespaces=None):
    """Find text for first matching element by tag name or path.

    *path* is a string having either an element tag or an XPath,
    *default* is the value to return if the element was not found,
    *namespaces* is an optional mapping from namespace prefix to full name.

    The lookup is delegated to ElementPath.findtext.  Return text content
    of first matching element, or default value if none was found.  Note
    that if an element is found having no text content, the empty string
    is returned.

    """
    lookup = ElementPath.findtext
    return lookup(self, path, default, namespaces)
def findall(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    The search is delegated to ElementPath.findall.  Returns list
    containing all matching elements in document order.

    """
    search = ElementPath.findall
    return search(self, path, namespaces)
def iterfind(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    The search is delegated to ElementPath.iterfind.  Return an iterable
    yielding all matching elements in document order.

    """
    search = ElementPath.iterfind
    return search(self, path, namespaces)
350 This function removes all subelements, clears all attributes, and sets
351 the text and tail attributes to None.
356 self.text = self.tail = None
def get(self, key, default=None):
    """Get element attribute.

    Equivalent to attrib.get.  *key* is what attribute to look for, and
    *default* is what to return if the attribute was not found.

    Returns the attribute value stored under *key*, or the default if the
    attribute was not found.

    """
    attributes = self.attrib
    return attributes.get(key, default)
def set(self, key, value):
    """Set element attribute.

    Equivalent to attrib[key] = value.  *key* is what attribute to set,
    and *value* is the attribute value to set it to.

    """
    attributes = self.attrib
    attributes[key] = value
382 """Get list of attribute names.
384 Names are returned in an arbitrary order, just like an ordinary
385 Python dict. Equivalent to attrib.keys()
388 return self.attrib.keys()
391 """Get element attributes as a sequence.
393 The attributes are returned in arbitrary order. Equivalent to
396 Return a list of (name, value) tuples.
399 return self.attrib.items()
401 def iter(self, tag=None):
402 """Create tree iterator.
404 The iterator loops over the element and all subelements in document
405 order, returning all elements with a matching tag.
407 If the tree structure is modified during iteration, new or removed
408 elements may or may not be included. To get a stable set, use the
409 list() function on the iterator, and loop over the resulting list.
411 *tag* is what tags to look for (default is to return all elements)
413 Return an iterator containing all the matching elements.
418 if tag is None or self.tag == tag:
420 for e in self._children:
421 yield from e.iter(tag)
424 def getiterator(self, tag=None):
425 # Change for a DeprecationWarning in 1.4
427 "This method will be removed in future versions. "
428 "Use 'elem.iter()' or 'list(elem.iter())' instead.",
429 PendingDeprecationWarning,
432 return list(self.iter(tag))
435 """Create text iterator.
437 The iterator loops over the element and all subelements in document
438 order, returning all inner text.
442 if not isinstance(tag, str) and tag is not None:
447 yield from e.itertext()
452 def SubElement(parent, tag, attrib={}, **extra):
453 """Subelement factory which creates an element instance, and appends it
454 to an existing parent.
456 The element tag, attribute names, and attribute values can be either
457 bytes or Unicode strings.
459 *parent* is the parent element, *tag* is the subelements name, *attrib* is
460 an optional dictionary containing element attributes, *extra* are
461 additional attributes given as keyword arguments.
464 attrib = attrib.copy()
466 element = parent.makeelement(tag, attrib)
467 parent.append(element)
471 def Comment(text=None):
472 """Comment element factory.
474 This function creates a special element which the standard serializer
475 serializes as an XML comment.
477 *text* is a string containing the comment string.
480 element = Element(Comment)
485 def ProcessingInstruction(target, text=None):
486 """Processing Instruction element factory.
488 This function creates a special element which the standard serializer
489 serializes as an XML processing instruction.
491 *target* is a string containing the processing instruction, *text* is a
492 string containing the processing instruction contents, if any.
495 element = Element(ProcessingInstruction)
496 element.text = target
498 element.text = element.text + " " + text
502 PI = ProcessingInstruction
506 """Qualified name wrapper.
508 This class can be used to wrap a QName attribute value in order to get
509 proper namespace handling on output.
511 *text_or_uri* is a string containing the QName value either in the form
512 {uri}local, or if the tag argument is given, the URI part of a QName.
514 *tag* is an optional argument which if given, will make the first
515 argument (text_or_uri) be interpreted as a URI, and this argument (tag)
516 be interpreted as a local name.
520 def __init__(self, text_or_uri, tag=None):
522 text_or_uri = "{%s}%s" % (text_or_uri, tag)
523 self.text = text_or_uri
529 return "<QName %r>" % (self.text,)
532 return hash(self.text)
534 def __le__(self, other):
535 if isinstance(other, QName):
536 return self.text <= other.text
537 return self.text <= other
539 def __lt__(self, other):
540 if isinstance(other, QName):
541 return self.text < other.text
542 return self.text < other
544 def __ge__(self, other):
545 if isinstance(other, QName):
546 return self.text >= other.text
547 return self.text >= other
549 def __gt__(self, other):
550 if isinstance(other, QName):
551 return self.text > other.text
552 return self.text > other
554 def __eq__(self, other):
555 if isinstance(other, QName):
556 return self.text == other.text
557 return self.text == other
559 def __ne__(self, other):
560 if isinstance(other, QName):
561 return self.text != other.text
562 return self.text != other
565 # --------------------------------------------------------------------
569 """An XML element hierarchy.
571 This class also provides support for serialization to and from
574 *element* is an optional root element node,
575 *file* is an optional file handle or file name of an XML file whose
576 contents will be used to initialize the tree with.
580 def __init__(self, element=None, file=None):
581 # assert element is None or iselement(element)
582 self._root = element # first node
587 """Return root element of this tree."""
590 def _setroot(self, element):
591 """Replace root element of this tree.
593 This will discard the current contents of the tree and replace it
594 with the given element. Use with care!
597 # assert iselement(element)
600 def parse(self, source, parser=None):
601 """Load external XML document into element tree.
603 *source* is a file name or file object, *parser* is an optional parser
604 instance that defaults to XMLParser.
606 ParseError is raised if the parser fails to parse the document.
608 Returns the root element of the given source document.
612 if not hasattr(source, "read"):
613 source = open(source, "rb")
617 # If no parser was specified, create a default XMLParser
619 if hasattr(parser, "_parse_whole"):
620 # The default XMLParser, when it comes from an accelerator,
621 # can define an internal _parse_whole API for efficiency.
622 # It can be used to parse the whole source without feeding
624 self._root = parser._parse_whole(source)
627 data = source.read(65536)
631 self._root = parser.close()
def iter(self, tag=None):
    """Create and return tree iterator for the root element.

    The work is done by the root element's own iter() method, which this
    simply forwards to.

    *tag* is a string with the tag name to iterate over
    (default is to return all elements).

    """
    # assert self._root is not None
    root = self._root
    return root.iter(tag)
650 def getiterator(self, tag=None):
651 # Change for a DeprecationWarning in 1.4
653 "This method will be removed in future versions. "
654 "Use 'tree.iter()' or 'list(tree.iter())' instead.",
655 PendingDeprecationWarning,
658 return list(self.iter(tag))
660 def find(self, path, namespaces=None):
661 """Find first matching element by tag name or path.
663 Same as getroot().find(path), which is Element.find()
665 *path* is a string having either an element tag or an XPath,
666 *namespaces* is an optional mapping from namespace prefix to full name.
668 Return the first matching element, or None if no element was found.
671 # assert self._root is not None
675 "This search is broken in 1.3 and earlier, and will be "
676 "fixed in a future version. If you rely on the current "
677 "behaviour, change it to %r" % path,
681 return self._root.find(path, namespaces)
683 def findtext(self, path, default=None, namespaces=None):
684 """Find text for first matching element by tag name or path.
686 Same as getroot().findtext(path), which is Element.findtext()
688 *path* is a string having either an element tag or an XPath,
689 *namespaces* is an optional mapping from namespace prefix to full name.
691 Return text content of first matching element, or default value if none was found.
694 # assert self._root is not None
698 "This search is broken in 1.3 and earlier, and will be "
699 "fixed in a future version. If you rely on the current "
700 "behaviour, change it to %r" % path,
704 return self._root.findtext(path, default, namespaces)
706 def findall(self, path, namespaces=None):
707 """Find all matching subelements by tag name or path.
709 Same as getroot().findall(path), which is Element.findall().
711 *path* is a string having either an element tag or an XPath,
712 *namespaces* is an optional mapping from namespace prefix to full name.
714 Return list containing all matching elements in document order.
717 # assert self._root is not None
721 "This search is broken in 1.3 and earlier, and will be "
722 "fixed in a future version. If you rely on the current "
723 "behaviour, change it to %r" % path,
727 return self._root.findall(path, namespaces)
729 def iterfind(self, path, namespaces=None):
730 """Find all matching subelements by tag name or path.
732 Same as getroot().iterfind(path), which is element.iterfind()
734 *path* is a string having either an element tag or an XPath,
735 *namespaces* is an optional mapping from namespace prefix to full name.
737 Return an iterable yielding all matching elements in document order.
740 # assert self._root is not None
744 "This search is broken in 1.3 and earlier, and will be "
745 "fixed in a future version. If you rely on the current "
746 "behaviour, change it to %r" % path,
750 return self._root.iterfind(path, namespaces)
756 xml_declaration=None,
757 default_namespace=None,
760 short_empty_elements=True
762 """Write element tree to a file as XML.
765 *file_or_filename* -- file name or a file object opened for writing
767 *encoding* -- the output encoding (default: US-ASCII)
769 *xml_declaration* -- bool indicating if an XML declaration should be
770 added to the output. If None, an XML declaration
771 is added if encoding IS NOT either of:
772 US-ASCII, UTF-8, or Unicode
774 *default_namespace* -- sets the default XML namespace (for "xmlns")
776 *method* -- either "xml" (default), "html", "text", or "c14n"
778 *short_empty_elements* -- controls the formatting of elements
779 that contain no content. If True (default)
780 they are emitted as a single self-closed
781 tag, otherwise they are emitted as a pair
787 elif method not in _serialize:
788 raise ValueError("unknown method %r" % method)
793 encoding = "us-ascii"
794 enc_lower = encoding.lower()
795 with _get_writer(file_or_filename, enc_lower) as write:
796 if method == "xml" and (
799 xml_declaration is None
800 and enc_lower not in ("utf-8", "us-ascii", "unicode")
803 declared_encoding = encoding
804 if enc_lower == "unicode":
805 # Retrieve the default encoding for the xml declaration
808 declared_encoding = locale.getpreferredencoding()
809 write("<?xml version='1.0' encoding='%s'?>\n" % (declared_encoding,))
811 _serialize_text(write, self._root)
813 qnames, namespaces = _namespaces(self._root, default_namespace)
814 serialize = _serialize[method]
820 short_empty_elements=short_empty_elements,
def write_c14n(self, file):
    """Write the tree to *file* by calling write() with method="c14n".

    Returns whatever write() returns.
    """
    # lxml.etree compatibility. use output method instead
    result = self.write(file, method="c14n")
    return result
828 # --------------------------------------------------------------------
829 # serialization support
832 @contextlib.contextmanager
833 def _get_writer(file_or_filename, encoding):
834 # returns text write method and release all resources after using
836 write = file_or_filename.write
837 except AttributeError:
838 # file_or_filename is a file name
839 if encoding == "unicode":
840 file = open(file_or_filename, "w")
843 file_or_filename, "w", encoding=encoding, errors="xmlcharrefreplace"
848 # file_or_filename is a file-like object
849 # encoding determines if it is a text or binary writer
850 if encoding == "unicode":
851 # use a text writer as is
854 # wrap a binary writer with TextIOWrapper
855 with contextlib.ExitStack() as stack:
856 if isinstance(file_or_filename, io.BufferedIOBase):
857 file = file_or_filename
858 elif isinstance(file_or_filename, io.RawIOBase):
859 file = io.BufferedWriter(file_or_filename)
860 # Keep the original file open when the BufferedWriter is
862 stack.callback(file.detach)
864 # This is to handle passed objects that aren't in the
865 # IOBase hierarchy, but just have a write method
866 file = io.BufferedIOBase()
867 file.writable = lambda: True
870 # TextIOWrapper uses these methods to determine
871 # if BOM (for UTF-16, etc) should be added
872 file.seekable = file_or_filename.seekable
873 file.tell = file_or_filename.tell
874 except AttributeError:
876 file = io.TextIOWrapper(
877 file, encoding=encoding, errors="xmlcharrefreplace", newline="\n"
879 # Keep the original file open when the TextIOWrapper is
881 stack.callback(file.detach)
885 def _namespaces(elem, default_namespace=None):
886 # identify namespaces used in this tree
888 # maps qnames to *encoded* prefix:local names
889 qnames = {None: None}
891 # maps uri:s to prefixes
893 if default_namespace:
894 namespaces[default_namespace] = ""
896 def add_qname(qname):
897 # calculate serialized qname representation
900 uri, tag = qname[1:].rsplit("}", 1)
901 prefix = namespaces.get(uri)
903 prefix = _namespace_map.get(uri)
905 prefix = "ns%d" % len(namespaces)
907 namespaces[uri] = prefix
909 qnames[qname] = "%s:%s" % (prefix, tag)
911 qnames[qname] = tag # default element
913 if default_namespace:
914 # FIXME: can this be handled in XML 1.0?
916 "cannot use non-qualified names with "
917 "default_namespace option"
919 qnames[qname] = qname
921 _raise_serialization_error(qname)
923 # populate qname and namespaces table
924 for elem in elem.iter():
926 if isinstance(tag, QName):
927 if tag.text not in qnames:
929 elif isinstance(tag, str):
930 if tag not in qnames:
932 elif tag is not None and tag is not Comment and tag is not PI:
933 _raise_serialization_error(tag)
934 for key, value in elem.items():
935 if isinstance(key, QName):
937 if key not in qnames:
939 if isinstance(value, QName) and value.text not in qnames:
940 add_qname(value.text)
942 if isinstance(text, QName) and text.text not in qnames:
944 return qnames, namespaces
947 def _serialize_xml(write, elem, qnames, namespaces, short_empty_elements, **kwargs):
951 write("<!--%s-->" % text)
952 elif tag is ProcessingInstruction:
953 write("<?%s?>" % text)
958 write(_escape_cdata(text))
961 write, e, qnames, None, short_empty_elements=short_empty_elements
965 items = list(elem.items())
966 if items or namespaces:
969 namespaces.items(), key=lambda x: x[1]
973 write(' xmlns%s="%s"' % (k, _escape_attrib(v)))
974 for k, v in sorted(items): # lexical order
975 if isinstance(k, QName):
977 if isinstance(v, QName):
980 v = _escape_attrib(v)
981 write(' %s="%s"' % (qnames[k], v))
982 if text or len(elem) or not short_empty_elements:
985 write(_escape_cdata(text))
992 short_empty_elements=short_empty_elements,
994 write("</" + tag + ">")
998 write(_escape_cdata(elem.tail))
1001 # add from cvw jan 2019
1002 def _serialize_pretty_xml(
1003 write, elem, qnames, namespaces, short_empty_elements, indent=0
1005 # print("*****pretty***** indent", elem.tag, indent)
1009 write("<!--%s-->" % text)
1010 elif tag is ProcessingInstruction:
1011 write("<?%s?>" % text)
1016 write(_escape_cdata(text))
1018 _serialize_pretty_xml(
1023 short_empty_elements=short_empty_elements,
1027 write(" " * indent + "<" + tag)
1028 items = list(elem.items())
1029 if items or namespaces:
1032 namespaces.items(), key=lambda x: x[1]
1036 write(' xmlns%s="%s"' % (k, _escape_attrib(v)))
1037 for k, v in sorted(items): # lexical order
1038 # print("atrrib ", k, v)
1039 if isinstance(k, QName):
1041 if isinstance(v, QName):
1044 v = _escape_attrib(v)
1045 write(' %s="%s"' % (qnames[k], v))
1046 if text or len(elem) or not short_empty_elements:
1049 write(_escape_cdata(text))
1054 _serialize_pretty_xml(
1059 short_empty_elements=short_empty_elements,
1062 write(" " * indent + "</" + tag + ">\n")
1066 write(_escape_cdata(elem.tail))
1086 HTML_EMPTY = set(HTML_EMPTY)
1091 def _serialize_html(write, elem, qnames, namespaces, **kwargs):
1095 write("<!--%s-->" % _escape_cdata(text))
1096 elif tag is ProcessingInstruction:
1097 write("<?%s?>" % _escape_cdata(text))
1102 write(_escape_cdata(text))
1104 _serialize_html(write, e, qnames, None)
1107 items = list(elem.items())
1108 if items or namespaces:
1111 namespaces.items(), key=lambda x: x[1]
1115 write(' xmlns%s="%s"' % (k, _escape_attrib(v)))
1116 for k, v in sorted(items): # lexical order
1117 if isinstance(k, QName):
1119 if isinstance(v, QName):
1122 v = _escape_attrib_html(v)
1123 # FIXME: handle boolean attributes
1124 write(' %s="%s"' % (qnames[k], v))
1128 if ltag == "script" or ltag == "style":
1131 write(_escape_cdata(text))
1133 _serialize_html(write, e, qnames, None)
1134 if ltag not in HTML_EMPTY:
1135 write("</" + tag + ">")
1137 write(_escape_cdata(elem.tail))
1140 def _serialize_text(write, elem):
1141 for part in elem.itertext():
1148 "xml": _serialize_xml,
1149 "pretty_xml": _serialize_pretty_xml,
1150 "html": _serialize_html,
1151 "text": _serialize_text,
1152 # this optional method is imported at the end of the module
1153 # "c14n": _serialize_c14n,
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri.  The pair
    is stored in the module-level _namespace_map.

    ValueError is raised if *prefix* has the reserved form "ns" followed
    by digits.

    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a snapshot, since entries are deleted while scanning.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix
1178 # "well-known" namespace prefixes
1179 "http://www.w3.org/XML/1998/namespace": "xml",
1180 "http://www.w3.org/1999/xhtml": "html",
1181 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
1182 "http://schemas.xmlsoap.org/wsdl/": "wsdl",
1184 "http://www.w3.org/2001/XMLSchema": "xs",
1185 "http://www.w3.org/2001/XMLSchema-instance": "xsi",
1187 "http://purl.org/dc/elements/1.1/": "dc",
1189 # For tests and troubleshooting
1190 register_namespace._namespace_map = _namespace_map
1193 def _raise_serialization_error(text):
1194 raise TypeError("cannot serialize %r (type %s)" % (text, type(text).__name__))
1197 def _escape_cdata(text):
1198 # escape character data
1200 # it's worth avoiding do-nothing calls for strings that are
1201 # shorter than 500 character, or so. assume that's, by far,
1202 # the most common case in most applications.
1204 text = text.replace("&", "&")
1206 text = text.replace("<", "<")
1208 text = text.replace(">", ">")
1210 except (TypeError, AttributeError):
1211 _raise_serialization_error(text)
1214 def _escape_attrib(text):
1215 # escape attribute value
1218 text = text.replace("&", "&")
1220 text = text.replace("<", "<")
1222 text = text.replace(">", ">")
1224 text = text.replace('"', """)
1226 text = text.replace("\n", " ")
1228 except (TypeError, AttributeError):
1229 _raise_serialization_error(text)
1232 def _escape_attrib_html(text):
1233 # escape attribute value
1236 text = text.replace("&", "&")
1238 text = text.replace(">", ">")
1240 text = text.replace('"', """)
1242 except (TypeError, AttributeError):
1243 _raise_serialization_error(text)
1246 # --------------------------------------------------------------------
1249 def tostring(element, encoding=None, method=None, *, short_empty_elements=True):
1250 """Generate string representation of XML element.
1252 All subelements are included. If encoding is "unicode", a string
1253 is returned. Otherwise a bytestring is returned.
1255 *element* is an Element instance, *encoding* is an optional output
1256 encoding defaulting to US-ASCII, *method* is an optional output which can
1257 be one of "xml" (default), "html", "text" or "c14n".
1259 Returns an (optionally) encoded string containing the XML data.
1262 stream = io.StringIO() if encoding == "unicode" else io.BytesIO()
1263 ElementTree(element).write(
1264 stream, encoding, method=method, short_empty_elements=short_empty_elements
1266 return stream.getvalue()
1269 class _ListDataStream(io.BufferedIOBase):
1270 """An auxiliary stream accumulating into a list reference."""
1272 def __init__(self, lst):
1285 return len(self.lst)
1288 def tostringlist(element, encoding=None, method=None, *, short_empty_elements=True):
1290 stream = _ListDataStream(lst)
1291 ElementTree(element).write(
1292 stream, encoding, method=method, short_empty_elements=short_empty_elements
1298 """Write element tree or element structure to sys.stdout.
1300 This function should be used for debugging only.
1302 *elem* is either an ElementTree, or a single Element. The exact output
1303 format is implementation dependent. In this version, it's written as an
1308 if not isinstance(elem, ElementTree):
1309 elem = ElementTree(elem)
1310 elem.write(sys.stdout, encoding="unicode")
1311 tail = elem.getroot().tail
1312 if not tail or tail[-1] != "\n":
1313 sys.stdout.write("\n")
1316 # --------------------------------------------------------------------
1320 def parse(source, parser=None):
1321 """Parse XML document into element tree.
1323 *source* is a filename or file object containing XML data,
1324 *parser* is an optional parser instance defaulting to XMLParser.
1326 Return an ElementTree instance.
1329 tree = ElementTree()
1330 tree.parse(source, parser)
1334 def iterparse(source, events=None, parser=None):
1335 """Incrementally parse XML document into ElementTree.
1337 This class also reports what's going on to the user based on the
1338 *events* it is initialized with. The supported events are the strings
1339 "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
1340 detailed namespace information). If *events* is omitted, only
1341 "end" events are reported.
1343 *source* is a filename or file object containing XML data, *events* is
1344 a list of events to report back, *parser* is an optional parser instance.
1346 Returns an iterator providing (event, elem) pairs.
1349 close_source = False
1350 if not hasattr(source, "read"):
1351 source = open(source, "rb")
1354 return _IterParseIterator(source, events, parser, close_source)
class XMLPullParser:
    """Parser that collects parse events in a queue for later retrieval.

    Data is pushed in with feed(); the events it produced are pulled out
    with read_events(). *events* is the sequence of event names to report
    and defaults to ("end",).
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        # _elementtree.c expects a list, not a deque
        self._events_queue = []
        # Position of the next unread entry in the queue.
        self._index = 0
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        if events is None:
            events = ("end",)
        self._parser._setevents(self._events_queue, events)

    def feed(self, data):
        """Feed encoded data to parser.

        Raises ValueError when called after the parser has been closed.
        A SyntaxError raised by the underlying parser is not propagated
        here; it is queued and re-raised from read_events().
        """
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if data:
            try:
                self._parser.feed(data)
            except SyntaxError as exc:
                self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        # None marks the end of the stream; feed() checks for it.
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator. A queued exception is raised instead
        of being yielded.
        """
        events = self._events_queue
        while True:
            index = self._index
            try:
                event = events[self._index]
                # Avoid retaining references to past events
                events[self._index] = None
            except IndexError:
                break
            index += 1
            # Compact the list in a O(1) amortized fashion
            # As noted above, _elementtree.c needs a list, not a deque
            if index * 2 >= len(events):
                # Slice assignment keeps the same list object, which the
                # parser's handlers hold a reference to.
                events[:index] = []
                self._index = 0
            else:
                self._index = index
            if isinstance(event, Exception):
                raise event
            else:
                yield event
class _IterParseIterator:
    """Iterator returned by iterparse().

    Reads *source* in 16 KiB chunks, feeds them to an XMLPullParser and
    returns the resulting events one at a time. When *close_source* is
    true the file is closed once iteration ends or fails.
    """

    def __init__(self, source, events, parser, close_source=False):
        # Use the internal, undocumented _parser argument for now; When the
        # parser argument of iterparse is removed, this can be killed.
        self._parser = XMLPullParser(events=events, _parser=parser)
        self._file = source
        self._close_file = close_source
        # root is only published after the document has been fully parsed.
        self.root = self._root = None

    def __next__(self):
        try:
            while 1:
                # Hand out an already queued event, if there is one.
                for event in self._parser.read_events():
                    return event
                # The pull parser drops its inner parser once it is closed.
                if self._parser._parser is None:
                    break
                # load event buffer
                data = self._file.read(16 * 1024)
                if data:
                    self._parser.feed(data)
                else:
                    self._root = self._parser._close_and_return_root()
            self.root = self._root
        except BaseException:
            if self._close_file:
                self._file.close()
            raise
        if self._close_file:
            self._file.close()
        raise StopIteration

    def __iter__(self):
        return self
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    return parser.close()
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    tree = parser.close()
    ids = {}
    for elem in tree.iter():
        elem_id = elem.get("id")
        # Elements without an id attribute, or with an empty one, are skipped.
        if elem_id:
            ids[elem_id] = elem
    return tree, ids
# Parse XML document from string constant. Alias for XML().
fromstring = XML
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence, *parser* is an optional parser
    instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    for text in sequence:
        parser.feed(text)
    return parser.close()
1523 # --------------------------------------------------------------------
class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    """

    def __init__(self, element_factory=None):
        self._data = []  # data collector
        self._elem = []  # element stack
        self._last = None  # last element
        self._tail = None  # true if we're after an end tag
        if element_factory is None:
            element_factory = Element
        self._factory = element_factory

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # Attach the collected character data to the most recent element:
        # as its tail when an end tag was seen last, otherwise as its text.
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        # Anything but the first element is appended to the element
        # currently on top of the stack.
        if self._elem:
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = None
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag, "end tag mismatch (expected %s, got %s)" % (
            self._last.tag,
            tag,
        )
        self._tail = 1
        return self._last
1602 # also see ElementTree and TreeBuilder
class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *html* are predefined HTML entities (not supported currently),
    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    """

    def __init__(self, html=0, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # "}" is passed as the namespace separator; _fixname() later
        # prepends "{" to any name containing it.
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {}  # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, "start"):
            parser.StartElementHandler = self._start
        if hasattr(target, "end"):
            parser.EndElementHandler = self._end
        if hasattr(target, "data"):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, "comment"):
            parser.CommentHandler = target.comment
        if hasattr(target, "pi"):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        self._doctype = None
        # Replacement text for entities; consulted by _default().
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass  # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor).
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        parser = self._parser
        append = events_queue.append
        # Each handler binds its event name and callables as default
        # arguments, so later loop iterations cannot change them.
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1

                def handler(
                    tag, attrib_in, event=event_name, append=append, start=self._start
                ):
                    append((event, start(tag, attrib_in)))

                parser.StartElementHandler = handler
            elif event_name == "end":

                def handler(tag, event=event_name, append=append, end=self._end):
                    append((event, end(tag)))

                parser.EndElementHandler = handler
            elif event_name == "start-ns":

                def handler(prefix, uri, event=event_name, append=append):
                    append((event, (prefix or "", uri or "")))

                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":

                def handler(prefix, event=event_name, append=append):
                    append((event, None))

                parser.EndNamespaceDeclHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, carrying over its code and
        # position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            for i in range(0, len(attr_list), 2):
                attrib[fixname(attr_list[i])] = attr_list[i + 1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat

                err = expat.error(
                    "undefined entity %s: line %d, column %d"
                    % (text, self.parser.ErrorLineNumber, self.parser.ErrorColumnNumber)
                )
                err.code = 11  # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = []  # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                    if pubid:
                        pubid = pubid[1:-1]
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif self.doctype != self._XMLParser__doctype:
                    # warn about deprecated call
                    self._XMLParser__doctype(name, pubid, system[1:-1])
                    self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    def doctype(self, name, pubid, system):
        """(Deprecated) Handle doctype declaration

        *name* is the Doctype name, *pubid* is the public identifier,
        and *system* is the system identifier.

        """
        warnings.warn(
            "This method of XMLParser is deprecated. Define doctype() "
            "method on the TreeBuilder target.",
            DeprecationWarning,
        )

    # sentinel, if doctype is redefined in a subclass
    __doctype = doctype

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure."""
        try:
            self.parser.Parse("", 1)  # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target
1822 # Import the C accelerators
1824 # Element is going to be shadowed by the C implementation. We need to keep
1825 # the Python version of it accessible for some "creative" uses by external code
1827 _Element_Py = Element
1829 # Element, SubElement, ParseError, TreeBuilder, XMLParser
1830 from _elementtree import *