src/ElementPath.py

   1 #
   2 # ElementTree
   3 # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
   4 #
   5 # limited xpath support for element trees
   6 #
   7 # history:
   8 # 2003-05-23 fl   created
   9 # 2003-05-28 fl   added support for // etc
  10 # 2003-08-27 fl   fixed parsing of periods in element names
  11 # 2007-09-10 fl   new selection engine
  12 # 2007-09-12 fl   fixed parent selector
  13 # 2007-09-13 fl   added iterfind; changed findall to return a list
  14 # 2007-11-30 fl   added namespaces support
  15 # 2009-10-30 fl   added child element value filter
  16 #
  17 # Copyright (c) 2003-2009 by Fredrik Lundh.  All rights reserved.
  18 #
  19 # fredrik@pythonware.com
  20 # http://www.pythonware.com
  21 #
  22 # --------------------------------------------------------------------
  23 # The ElementTree toolkit is
  24 #
  25 # Copyright (c) 1999-2009 by Fredrik Lundh
  26 #
  27 # By obtaining, using, and/or copying this software and/or its
  28 # associated documentation, you agree that you have read, understood,
  29 # and will comply with the following terms and conditions:
  30 #
  31 # Permission to use, copy, modify, and distribute this software and
  32 # its associated documentation for any purpose and without fee is
  33 # hereby granted, provided that the above copyright notice appears in
  34 # all copies, and that both that copyright notice and this permission
  35 # notice appear in supporting documentation, and that the name of
  36 # Secret Labs AB or the author not be used in advertising or publicity
  37 # pertaining to distribution of the software without specific, written
  38 # prior permission.
  39 #
  40 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
  41 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
  42 # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
  43 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
  44 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  45 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  46 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  47 # OF THIS SOFTWARE.
  48 # --------------------------------------------------------------------
  49
  50 # Licensed to PSF under a Contributor Agreement.
  51 # See http://www.python.org/psf/license for licensing details.
  52
  53 ##
  54 # Implementation module for XPath support.  There's usually no reason
  55 # to import this module directly; the <b>ElementTree</b> does this for
  56 # you, if needed.
  57 ##
  58
  59 import re
  60
  61 xpath_tokenizer_re = re.compile(
  62     "("
  63     "'[^']*'|\"[^\"]*\"|"
  64     "::|"
  65     "//?|"
  66     "\.\.|"
  67     "\(\)|"
  68     "[/.*:\[\]\(\)@=])|"
  69     "((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
  70     "\s+"
  71 )
  72
  73
  74 def xpath_tokenizer(pattern, namespaces=None):
  75     for token in xpath_tokenizer_re.findall(pattern):
  76         tag = token[1]
  77         if tag and tag[0] != "{" and ":" in tag:
  78             try:
  79                 prefix, uri = tag.split(":", 1)
  80                 if not namespaces:
  81                     raise KeyError
  82                 yield token[0], "{%s}%s" % (namespaces[prefix], uri)
  83             except KeyError:
  84                 raise SyntaxError("prefix %r not found in prefix map" % prefix)
  85         else:
  86             yield token
  87
  88
  89 def get_parent_map(context):
  90     parent_map = context.parent_map
  91     if parent_map is None:
  92         context.parent_map = parent_map = {}
  93         for p in context.root.iter():
  94             for e in p:
  95                 parent_map[e] = p
  96     return parent_map
  97
  98
  99 def prepare_child(next, token):
 100     tag = token[1]
 101
 102     def select(context, result):
 103         for elem in result:
 104             for e in elem:
 105                 if e.tag == tag:
 106                     yield e
 107
 108     return select
 109
 110
 111 def prepare_star(next, token):
 112     def select(context, result):
 113         for elem in result:
 114             yield from elem
 115
 116     return select
 117
 118
 119 def prepare_self(next, token):
 120     def select(context, result):
 121         yield from result
 122
 123     return select
 124
 125
 126 def prepare_descendant(next, token):
 127     token = next()
 128     if token[0] == "*":
 129         tag = "*"
 130     elif not token[0]:
 131         tag = token[1]
 132     else:
 133         raise SyntaxError("invalid descendant")
 134
 135     def select(context, result):
 136         for elem in result:
 137             for e in elem.iter(tag):
 138                 if e is not elem:
 139                     yield e
 140
 141     return select
 142
 143
 144 def prepare_parent(next, token):
 145     def select(context, result):
 146         # FIXME: raise error if .. is applied at toplevel?
 147         parent_map = get_parent_map(context)
 148         result_map = {}
 149         for elem in result:
 150             if elem in parent_map:
 151                 parent = parent_map[elem]
 152                 if parent not in result_map:
 153                     result_map[parent] = None
 154                     yield parent
 155
 156     return select
 157
 158
 159 def prepare_predicate(next, token):
 160     # FIXME: replace with real parser!!! refs:
 161     # http://effbot.org/zone/simple-iterator-parser.htm
 162     # http://javascript.crockford.com/tdop/tdop.html
 163     signature = []
 164     predicate = []
 165     while 1:
 166         token = next()
 167         if token[0] == "]":
 168             break
 169         if token[0] and token[0][:1] in "'\"":
 170             token = "'", token[0][1:-1]
 171         signature.append(token[0] or "-")
 172         predicate.append(token[1])
 173     signature = "".join(signature)
 174     # use signature to determine predicate type
 175     if signature == "@-":
 176         # [@attribute] predicate
 177         key = predicate[1]
 178
 179         def select(context, result):
 180             for elem in result:
 181                 if elem.get(key) is not None:
 182                     yield elem
 183
 184         return select
 185     if signature == "@-='":
 186         # [@attribute='value']
 187         key = predicate[1]
 188         value = predicate[-1]
 189
 190         def select(context, result):
 191             for elem in result:
 192                 if elem.get(key) == value:
 193                     yield elem
 194
 195         return select
 196     if signature == "-" and not re.match("\-?\d+$", predicate[0]):
 197         # [tag]
 198         tag = predicate[0]
 199
 200         def select(context, result):
 201             for elem in result:
 202                 if elem.find(tag) is not None:
 203                     yield elem
 204
 205         return select
 206     if signature == "-='" and not re.match("\-?\d+$", predicate[0]):
 207         # [tag='value']
 208         tag = predicate[0]
 209         value = predicate[-1]
 210
 211         def select(context, result):
 212             for elem in result:
 213                 for e in elem.findall(tag):
 214                     if "".join(e.itertext()) == value:
 215                         yield elem
 216                         break
 217
 218         return select
 219     if signature == "-" or signature == "-()" or signature == "-()-":
 220         # [index] or [last()] or [last()-index]
 221         if signature == "-":
 222             # [index]
 223             index = int(predicate[0]) - 1
 224             if index < 0:
 225                 raise SyntaxError("XPath position >= 1 expected")
 226         else:
 227             if predicate[0] != "last":
 228                 raise SyntaxError("unsupported function")
 229             if signature == "-()-":
 230                 try:
 231                     index = int(predicate[2]) - 1
 232                 except ValueError:
 233                     raise SyntaxError("unsupported expression")
 234                 if index > -2:
 235                     raise SyntaxError("XPath offset from last() must be negative")
 236             else:
 237                 index = -1
 238
 239         def select(context, result):
 240             parent_map = get_parent_map(context)
 241             for elem in result:
 242                 try:
 243                     parent = parent_map[elem]
 244                     # FIXME: what if the selector is "*" ?
 245                     elems = list(parent.findall(elem.tag))
 246                     if elems[index] is elem:
 247                         yield elem
 248                 except (IndexError, KeyError):
 249                     pass
 250
 251         return select
 252     raise SyntaxError("invalid predicate")
 253
 254
# Dispatch table used by iterfind(): maps the operator part of a token
# (token[0]) to the function that prepares the selector for that step.
# The empty key handles plain tag names, whose operator part is "".
ops = {
    "": prepare_child,
    "*": prepare_star,
    ".": prepare_self,
    "..": prepare_parent,
    "//": prepare_descendant,
    "[": prepare_predicate,
}

# Compiled selector lists, keyed by (path, namespaces) in iterfind(), which
# empties the whole cache once it holds more than 100 entries.
_cache = {}
 265
 266
class _SelectorContext:
    """State handed to every selector function during one iterfind() call.

    ``root`` is the element the search started from.  ``parent_map`` is
    filled in by get_parent_map() the first time it is called for this
    context.
    """

    # child -> parent dictionary for ``root``; None until first requested
    parent_map = None

    def __init__(self, root):
        self.root = root
 272
 273
 274 # --------------------------------------------------------------------
 275
 276 ##
 277 # Generate all matching objects.
 278
 279
 280 def iterfind(elem, path, namespaces=None):
 281     # compile selector pattern
 282     cache_key = (
 283         path,
 284         None if namespaces is None else tuple(sorted(namespaces.items())),
 285     )
 286     if path[-1:] == "/":
 287         path = path + "*"  # implicit all (FIXME: keep this?)
 288     try:
 289         selector = _cache[cache_key]
 290     except KeyError:
 291         if len(_cache) > 100:
 292             _cache.clear()
 293         if path[:1] == "/":
 294             raise SyntaxError("cannot use absolute path on element")
 295         next = iter(xpath_tokenizer(path, namespaces)).__next__
 296         token = next()
 297         selector = []
 298         while 1:
 299             try:
 300                 selector.append(ops[token[0]](next, token))
 301             except StopIteration:
 302                 raise SyntaxError("invalid path")
 303             try:
 304                 token = next()
 305                 if token[0] == "/":
 306                     token = next()
 307             except StopIteration:
 308                 break
 309         _cache[cache_key] = selector
 310     # execute selector pattern
 311     result = [elem]
 312     context = _SelectorContext(elem)
 313     for select in selector:
 314         result = select(context, result)
 315     return result
 316
 317
 318 ##
 319 # Find first matching object.
 320
 321
 322 def find(elem, path, namespaces=None):
 323     try:
 324         return next(iterfind(elem, path, namespaces))
 325     except StopIteration:
 326         return None
 327
 328
 329 ##
 330 # Find all matching objects.
 331
 332
 333 def findall(elem, path, namespaces=None):
 334     return list(iterfind(elem, path, namespaces))
 335
 336
 337 ##
 338 # Find text for first matching object.
 339
 340
 341 def findtext(elem, path, default=None, namespaces=None):
 342     try:
 343         elem = next(iterfind(elem, path, namespaces))
 344         return elem.text or ""
 345     except StopIteration:
 346         return default