src/patching.py

   1 #!/usr/bin/env python
   2 """ Patch utility to apply unified diffs
   3
   4     Brute-force line-by-line non-recursive parsing
   5
   6     Copyright (c) 2008-2014 anatoly techtonik
   7     Available under the terms of MIT license
   8
   9     Project home: http://code.google.com/p/python-patch/
  10
  11
  12     $Id$
  13     $HeadURL$
  14 """
  15
# Module metadata: maintainer contact and the version string of this module
# (the "dev" suffix marks a development snapshot).
__author__ = "anatoly techtonik <techtonik@gmail.com>"
__version__ = "1.14dev"
  18
  19 import copy
  20 import logging
  21 import re
  22 # cStringIO doesn't support unicode in 2.5
  23 from StringIO import StringIO
  24 import urllib2
  25
  26 from os.path import exists, isfile, abspath
  27 import os
  28 import posixpath
  29 import shutil
  30
  31
#------------------------------------------------
# Logging is controlled by the logger named after the
# module name (e.g. 'patch' for the patch.py module)

# when True, the parser emits additional per-file/per-hunk debug() output
debugmode = False

logger = logging.getLogger(__name__)

# module-level shortcuts bound to the logger methods, used throughout the file
debug = logger.debug
info = logger.info
warning = logger.warning
  43
  44 class NullHandler(logging.Handler):
  45   """ Copied from Python 2.7 to avoid getting
  46       `No handlers could be found for logger "patch"`
  47       http://bugs.python.org/issue16539
  48   """
  49   def handle(self, record):
  50     pass
  51   def emit(self, record):
  52     pass
  53   def createLock(self):
  54     self.lock = None
  55
# attach the do-nothing handler so that the module logger always has a handler
logger.addHandler(NullHandler())
  57
  58 #------------------------------------------------
  59 # Constants for Patch/PatchSet types
  60
  61 DIFF = PLAIN = "plain"
  62 GIT = "git"
  63 HG = MERCURIAL = "mercurial"
  64 SVN = SUBVERSION = "svn"
  65 # mixed type is only actual when PatchSet contains
  66 # Patches of different type
  67 MIXED = MIXED = "mixed"
  68
  69
  70 #------------------------------------------------
  71 # Helpers (these could come with Python stdlib)
  72
# x...() functions are used to work with paths in a
# cross-platform manner - all paths use forward
# slashes even on Windows.
  76
  77 def xisabs(filename):
  78   """ Cross-platform version of `os.path.isabs()`
  79       Returns True if `filename` is absolute on
  80       Linux, OS X or Windows.
  81   """
  82   if filename.startswith('/'):     # Linux/Unix
  83     return True
  84   elif filename.startswith('\\'):  # Windows
  85     return True
  86   elif re.match(r'\w:[\\/]', filename): # Windows
  87     return True
  88   return False
  89
  90 def xnormpath(path):
  91   """ Cross-platform version of os.path.normpath """
  92   # replace escapes and Windows slashes
  93   normalized = posixpath.normpath(path).replace('\\', '/')
  94   # fold the result
  95   return posixpath.normpath(normalized)
  96
  97 def xstrip(filename):
  98   """ Make relative path out of absolute by stripping
  99       prefixes used on Linux, OS X and Windows.
 100
 101       This function is critical for security.
 102   """
 103   while xisabs(filename):
 104     # strip windows drive with all slashes
 105     if re.match(r'\w:[\\/]', filename):
 106       filename = re.sub(r'^\w+:[\\/]+', '', filename)
 107     # strip all slashes
 108     elif re.match(r'[\\/]', filename):
 109       filename = re.sub(r'^[\\/]+', '', filename)
 110   return filename
 111
 112 #-----------------------------------------------
 113 # Main API functions
 114
 115 def fromfile(filename):
 116   """ Parse patch file. If successful, returns
 117       PatchSet() object. Otherwise returns False.
 118   """
 119   patchset = PatchSet()
 120   debug("reading %s" % filename)
 121   fp = open(filename, "rb")
 122   res = patchset.parse(fp)
 123   fp.close()
 124   if res == True:
 125     return patchset
 126   return False
 127
 128
 129 def fromstring(s):
 130   """ Parse text string and return PatchSet()
 131       object (or False if parsing fails)
 132   """
 133   ps = PatchSet( StringIO(s) )
 134   if ps.errors == 0:
 135     return ps
 136   return False
 137
 138
 139 def fromurl(url):
 140   """ Parse patch from an URL, return False
 141       if an error occured. Note that this also
 142       can throw urlopen() exceptions.
 143   """
 144   ps = PatchSet( urllib2.urlopen(url) )
 145   if ps.errors == 0:
 146     return ps
 147   return False
 148
 149
 150 # --- Utility functions ---
 151 # [ ] reuse more universal pathsplit()
 152 def pathstrip(path, n):
 153   """ Strip n leading components from the given path """
 154   pathlist = [path]
 155   while os.path.dirname(pathlist[0]) != '':
 156     pathlist[0:1] = os.path.split(pathlist[0])
 157   return '/'.join(pathlist[n:])
 158 # --- /Utility function ---
 159
 160
 161 class Hunk(object):
 162   """ Parsed hunk data container (hunk starts with @@ -R +R @@) """
 163
 164   def __init__(self):
 165     self.startsrc=None #: line count starts with 1
 166     self.linessrc=None
 167     self.starttgt=None
 168     self.linestgt=None
 169     self.invalid=False
 170     self.desc=''
 171     self.text=[]
 172
 173 #  def apply(self, estream):
 174 #    """ write hunk data into enumerable stream
 175 #        return strings one by one until hunk is
 176 #        over
 177 #
 178 #        enumerable stream are tuples (lineno, line)
 179 #        where lineno starts with 0
 180 #    """
 181 #    pass
 182
 183
 184 class Patch(object):
 185   """ Patch for a single file.
 186       If used as an iterable, returns hunks.
 187   """
 188   def __init__(self):
 189     self.source = None
 190     self.target = None
 191     self.hunks = []
 192     self.hunkends = []
 193     self.header = []
 194
 195     self.type = None
 196
 197   def __iter__(self):
 198     for h in self.hunks:
 199       yield h
 200
 201
 202 class PatchSet(object):
 203   """ PatchSet is a patch parser and container.
 204       When used as an iterable, returns patches.
 205   """
 206
 207   def __init__(self, stream=None):
 208     # --- API accessible fields ---
 209
 210     # name of the PatchSet (filename or ...)
 211     self.name = None
 212     # patch set type - one of constants
 213     self.type = None
 214
 215     # list of Patch objects
 216     self.items = []
 217
 218     self.errors = 0    # fatal parsing errors
 219     self.warnings = 0  # non-critical warnings
 220     # --- /API ---
 221
 222     if stream:
 223       self.parse(stream)
 224
 225   def __len__(self):
 226     return len(self.items)
 227
 228   def __iter__(self):
 229     for i in self.items:
 230       yield i
 231
  def parse(self, stream):
    """ parse unified diff
        return True on success

        `stream` is consumed line by line through an enumerate()
        wrapper. Parsing is a state machine driven by the boolean
        flags defined below (headscan, filenames, hunkhead, hunkbody,
        hunkskip, hunkparsed).

        Side effects:
          - every Patch found is appended to self.items
          - self.errors is reset to 0 and then counts fatal problems
          - self.warnings is incremented for non-critical ones
            (it is not reset here)
          - types of patches and of the whole set are detected and
            filenames are normalized at the end

        Returns False when no patch data is found, or when the stream
        is incomplete and no patch was collected; otherwise returns
        True only if self.errors is 0.
    """
    # template of line-end counters, copied into every new Patch
    lineends = dict(lf=0, crlf=0, cr=0)
    nexthunkno = 0    #: even if index starts with 0 user messages number hunks from 1

    p = None      # Patch being filled, None until the first one is started
    hunk = None   # Hunk being filled
    # hunkactual variable is used to calculate hunk lines for comparison
    hunkactual = dict(linessrc=None, linestgt=None)


    class wrapumerate(enumerate):
      """Enumerate wrapper that uses boolean end of stream status instead of
      StopIteration exception, and properties to access line information.
      """

      def __init__(self, *args, **kwargs):
        # we don't call parent, it is magically created by __new__ method

        self._exhausted = False
        self._lineno = False     # 0-based index of the current line; not updated once the stream ends
        self._line = False       # will be reset to False after end of stream

      def next(self):
        """Try to read the next line and return True if it is available,
           False if end of stream is reached."""
        if self._exhausted:
          return False

        try:
          self._lineno, self._line = super(wrapumerate, self).next()
        except StopIteration:
          self._exhausted = True
          self._line = False
          return False
        return True

      @property
      def is_empty(self):
        return self._exhausted

      @property
      def line(self):
        return self._line

      @property
      def lineno(self):
        return self._lineno

    # define states (possible file regions) that direct parse flow
    headscan  = True  # start with scanning header
    filenames = False # lines starting with --- and +++

    hunkhead = False  # @@ -R +R @@ sequence
    hunkbody = False  #
    hunkskip = False  # skipping invalid hunk mode

    hunkparsed = False # state after successfully parsed hunk

    # regexp to match start of hunk, used groups - 1,3,4,6
    re_hunk_start = re.compile("^@@ -(\d+)(,(\d+))? \+(\d+)(,(\d+))? @@")

    self.errors = 0
    # temp buffers for header and filenames info
    header = []
    srcname = None
    tgtname = None

    # start of main cycle
    # each parsing block already has line available in fe.line
    fe = wrapumerate(stream)
    while fe.next():

      # -- deciders: these only switch state to decide who should process
      # --           line fetched at the start of this cycle
      if hunkparsed:
        hunkparsed = False
        if re_hunk_start.match(fe.line):
            hunkhead = True
        elif fe.line.startswith("--- "):
            filenames = True
        else:
            headscan = True
      # -- ------------------------------------

      # read out header
      if headscan:
        # collect everything up to the next "--- " line (or end of stream)
        while not fe.is_empty and not fe.line.startswith("--- "):
            header.append(fe.line)
            fe.next()
        if fe.is_empty:
            if p == None:
              debug("no patch data found")  # error is shown later
              self.errors += 1
            else:
              info("%d unparsed bytes left at the end of stream" % len(''.join(header)))
              self.warnings += 1
              # TODO check for \No new line at the end..
              # TODO test for unparsed bytes
              # otherwise error += 1
            # this is actually a loop exit
            continue

        headscan = False
        # switch to filenames state
        filenames = True

      # local copies - `line` may be modified below without touching fe.line
      line = fe.line
      lineno = fe.lineno


      # hunkskip and hunkbody code skipped until definition of hunkhead is parsed
      if hunkbody:
        # [x] treat empty lines inside hunks as containing single space
        #     (this happens when diff is saved by copy/pasting to editor
        #      that strips trailing whitespace)
        if line.strip("\r\n") == "":
            debug("expanding empty line in a middle of hunk body")
            self.warnings += 1
            line = ' ' + line

        # process line first
        if re.match(r"^[- \+\\]", line):
            # gather stats about line endings
            if line.endswith("\r\n"):
              p.hunkends["crlf"] += 1
            elif line.endswith("\n"):
              p.hunkends["lf"] += 1
            elif line.endswith("\r"):
              p.hunkends["cr"] += 1

            # count the line for source and/or target side of the hunk;
            # lines starting with backslash are not counted for either
            if line.startswith("-"):
              hunkactual["linessrc"] += 1
            elif line.startswith("+"):
              hunkactual["linestgt"] += 1
            elif not line.startswith("\\"):
              hunkactual["linessrc"] += 1
              hunkactual["linestgt"] += 1
            hunk.text.append(line)
            # todo: handle \ No newline cases
        else:
            warning("invalid hunk no.%d at %d for target file %s" % (nexthunkno, lineno+1, p.target))
            # add hunk status node
            hunk.invalid = True
            p.hunks.append(hunk)
            self.errors += 1
            # switch to hunkskip state
            hunkbody = False
            hunkskip = True

        # check exit conditions
        # (counts are compared against the sizes declared in the hunk header)
        if hunkactual["linessrc"] > hunk.linessrc or hunkactual["linestgt"] > hunk.linestgt:
            warning("extra lines for hunk no.%d at %d for target %s" % (nexthunkno, lineno+1, p.target))
            # add hunk status node
            hunk.invalid = True
            p.hunks.append(hunk)
            self.errors += 1
            # switch to hunkskip state
            hunkbody = False
            hunkskip = True
        elif hunk.linessrc == hunkactual["linessrc"] and hunk.linestgt == hunkactual["linestgt"]:
            # hunk parsed successfully
            p.hunks.append(hunk)
            # switch to hunkparsed state
            hunkbody = False
            hunkparsed = True

            # detect mixed window/unix line ends
            ends = p.hunkends
            if ((ends["cr"]!=0) + (ends["crlf"]!=0) + (ends["lf"]!=0)) > 1:
              warning("inconsistent line ends in patch hunks for %s" % p.source)
              self.warnings += 1
            if debugmode:
              debuglines = dict(ends)
              debuglines.update(file=p.target, hunk=nexthunkno)
              debug("crlf: %(crlf)d  lf: %(lf)d  cr: %(cr)d\t - file: %(file)s hunk: %(hunk)d" % debuglines)
            # fetch next line
            continue

      # skipping lines of an invalid hunk until the next hunk or file starts
      if hunkskip:
        if re_hunk_start.match(line):
          # switch to hunkhead state
          hunkskip = False
          hunkhead = True
        elif line.startswith("--- "):
          # switch to filenames state
          hunkskip = False
          filenames = True
          if debugmode and len(self.items) > 0:
            debug("- %2d hunks for %s" % (len(p.hunks), p.source))

      if filenames:
        if line.startswith("--- "):
          if srcname != None:
            # XXX testcase
            warning("skipping false patch for %s" % srcname)
            srcname = None
            # XXX header += srcname
            # double source filename line is encountered
            # attempt to restart from this second line
          # the filename is everything after "--- " up to the first tab
          re_filename = "^--- ([^\t]+)"
          match = re.match(re_filename, line)
          # todo: support spaces in filenames
          if match:
            srcname = match.group(1).strip()
          else:
            warning("skipping invalid filename at line %d" % lineno)
            self.errors += 1
            # XXX p.header += line
            # switch back to headscan state
            filenames = False
            headscan = True
        elif not line.startswith("+++ "):
          if srcname != None:
            warning("skipping invalid patch with no target for %s" % srcname)
            self.errors += 1
            srcname = None
            # XXX header += srcname
            # XXX header += line
          else:
            # this should be unreachable
            warning("skipping invalid target patch")
          filenames = False
          headscan = True
        else:
          if tgtname != None:
            # XXX seems to be a dead branch
            warning("skipping invalid patch - double target at line %d" % lineno)
            self.errors += 1
            srcname = None
            tgtname = None
            # XXX header += srcname
            # XXX header += tgtname
            # XXX header += line
            # double target filename line is encountered
            # switch back to headscan state
            filenames = False
            headscan = True
          else:
            re_filename = "^\+\+\+ ([^\t]+)"
            match = re.match(re_filename, line)
            if not match:
              warning("skipping invalid patch - no target filename at line %d" % lineno)
              self.errors += 1
              srcname = None
              # switch back to headscan state
              filenames = False
              headscan = True
            else:
              # both filenames are known - store the previous Patch
              # (if any) and start a new one
              if p: # for the first run p is None
                self.items.append(p)
              p = Patch()
              p.source = srcname
              srcname = None
              p.target = match.group(1).strip()
              p.header = header
              header = []
              # switch to hunkhead state
              filenames = False
              hunkhead = True
              nexthunkno = 0
              p.hunkends = lineends.copy()
              continue

      if hunkhead:
        # same pattern as re_hunk_start with an extra group 7 that
        # captures the text after the closing @@
        match = re.match("^@@ -(\d+)(,(\d+))? \+(\d+)(,(\d+))? @@(.*)", line)
        if not match:
          if not p.hunks:
            warning("skipping invalid patch with no hunks for file %s" % p.source)
            self.errors += 1
            # XXX review switch
            # switch to headscan state
            hunkhead = False
            headscan = True
            continue
          else:
            # TODO review condition case
            # switch to headscan state
            hunkhead = False
            headscan = True
        else:
          hunk = Hunk()
          hunk.startsrc = int(match.group(1))
          # line counts default to 1 when omitted in the hunk header
          hunk.linessrc = 1
          if match.group(3): hunk.linessrc = int(match.group(3))
          hunk.starttgt = int(match.group(4))
          hunk.linestgt = 1
          if match.group(6): hunk.linestgt = int(match.group(6))
          hunk.invalid = False
          # [1:] drops the first character of the captured text
          hunk.desc = match.group(7)[1:].rstrip()
          hunk.text = []

          hunkactual["linessrc"] = hunkactual["linestgt"] = 0

          # switch to hunkbody state
          hunkhead = False
          hunkbody = True
          nexthunkno += 1
          continue

    # /while fe.next()

    # store the last Patch that was being filled
    if p:
      self.items.append(p)

    # the stream is over - check which state the parser was left in
    if not hunkparsed:
      if hunkskip:
        warning("warning: finished with errors, some hunks may be invalid")
      elif headscan:
        if len(self.items) == 0:
          warning("error: no patch data found!")
          return False
        else: # extra data at the end of file
          pass
      else:
        warning("error: patch stream is incomplete!")
        self.errors += 1
        if len(self.items) == 0:
          return False

    if debugmode and len(self.items) > 0:
        debug("- %2d hunks for %s" % (len(p.hunks), p.source))

    # XXX fix total hunks calculation
    debug("total files: %d  total hunks: %d" % (len(self.items),
        sum(len(p.hunks) for p in self.items)))

    # ---- detect patch and patchset types ----
    for idx, p in enumerate(self.items):
      self.items[idx].type = self._detect_type(p)

    # the set gets the common type of its patches or MIXED if they differ
    types = set([p.type for p in self.items])
    if len(types) > 1:
      self.type = MIXED
    else:
      self.type = types.pop()
    # --------

    self._normalize_filenames()

    return (self.errors == 0)
 575
 576   def _detect_type(self, p):
 577     """ detect and return type for the specified Patch object
 578         analyzes header and filenames info
 579
 580         NOTE: must be run before filenames are normalized
 581     """
 582
 583     # check for SVN
 584     #  - header starts with Index:
 585     #  - next line is ===... delimiter
 586     #  - filename is followed by revision number
 587     # TODO add SVN revision
 588     if (len(p.header) > 1 and p.header[-2].startswith("Index: ")
 589           and p.header[-1].startswith("="*67)):
 590         return SVN
 591
 592     # common checks for both HG and GIT
 593     DVCS = ((p.source.startswith('a/') or p.source == '/dev/null')
 594         and (p.target.startswith('b/') or p.target == '/dev/null'))
 595
 596     # GIT type check
 597     #  - header[-2] is like "diff --git a/oldname b/newname"
 598     #  - header[-1] is like "index <hash>..<hash> <mode>"
 599     # TODO add git rename diffs and add/remove diffs
 600     #      add git diff with spaced filename
 601     # TODO http://www.kernel.org/pub/software/scm/git/docs/git-diff.html
 602
 603     # detect the start of diff header - there might be some comments before
 604     if len(p.header) > 1:
 605       for idx in reversed(range(len(p.header))):
 606         if p.header[idx].startswith("diff --git"):
 607           break
 608       if re.match(r'diff --git a/[\w/.]+ b/[\w/.]+', p.header[idx]):
 609         if (idx+1 < len(p.header)
 610             and re.match(r'index \w{7}..\w{7} \d{6}', p.header[idx+1])):
 611           if DVCS:
 612             return GIT
 613
 614     # HG check
 615     #
 616     #  - for plain HG format header is like "diff -r b2d9961ff1f5 filename"
 617     #  - for Git-style HG patches it is "diff --git a/oldname b/newname"
 618     #  - filename starts with a/, b/ or is equal to /dev/null
 619     #  - exported changesets also contain the header
 620     #    # HG changeset patch
 621     #    # User name@example.com
 622     #    ...
 623     # TODO add MQ
 624     # TODO add revision info
 625     if len(p.header) > 0:
 626       if DVCS and re.match(r'diff -r \w{12} .*', p.header[-1]):
 627         return HG
 628       if DVCS and p.header[-1].startswith('diff --git a/'):
 629         if len(p.header) == 1:  # native Git patch header len is 2
 630           return HG
 631         elif p.header[0].startswith('# HG changeset patch'):
 632           return HG
 633
 634     return PLAIN
 635
 636
 637   def _normalize_filenames(self):
 638     """ sanitize filenames, normalizing paths, i.e.:
 639         1. strip a/ and b/ prefixes from GIT and HG style patches
 640         2. remove all references to parent directories (with warning)
 641         3. translate any absolute paths to relative (with warning)
 642
 643         [x] always use forward slashes to be crossplatform
 644             (diff/patch were born as a unix utility after all)
 645
 646         return None
 647     """
 648     for i,p in enumerate(self.items):
 649       if p.type in (HG, GIT):
 650         # TODO: figure out how to deal with /dev/null entries
 651         debug("stripping a/ and b/ prefixes")
 652         if p.source != '/dev/null':
 653           if not p.source.startswith("a/"):
 654             warning("invalid source filename")
 655           else:
 656             p.source = p.source[2:]
 657         if p.target != '/dev/null':
 658           if not p.target.startswith("b/"):
 659             warning("invalid target filename")
 660           else:
 661             p.target = p.target[2:]
 662
 663       p.source = xnormpath(p.source)
 664       p.target = xnormpath(p.target)
 665
 666       sep = '/'  # sep value can be hardcoded, but it looks nice this way
 667
 668       # references to parent are not allowed
 669       if p.source.startswith(".." + sep):
 670         warning("error: stripping parent path for source file patch no.%d" % (i+1))
 671         self.warnings += 1
 672         while p.source.startswith(".." + sep):
 673           p.source = p.source.partition(sep)[2]
 674       if p.target.startswith(".." + sep):
 675         warning("error: stripping parent path for target file patch no.%d" % (i+1))
 676         self.warnings += 1
 677         while p.target.startswith(".." + sep):
 678           p.target = p.target.partition(sep)[2]
 679       # absolute paths are not allowed
 680       if xisabs(p.source) or xisabs(p.target):
 681         warning("error: absolute paths are not allowed - file no.%d" % (i+1))
 682         self.warnings += 1
 683         if xisabs(p.source):
 684           warning("stripping absolute path from source name '%s'" % p.source)
 685           p.source = xstrip(p.source)
 686         if xisabs(p.target):
 687           warning("stripping absolute path from target name '%s'" % p.target)
 688           p.target = xstrip(p.target)
 689
 690       self.items[i].source = p.source
 691       self.items[i].target = p.target
 692
 693
 694   def diffstat(self):
 695     """ calculate diffstat and return as a string
 696         Notes:
 697           - original diffstat ouputs target filename
 698           - single + or - shouldn't escape histogram
 699     """
 700     names = []
 701     insert = []
 702     delete = []
 703     delta = 0    # size change in bytes
 704     namelen = 0
 705     maxdiff = 0  # max number of changes for single file
 706                  # (for histogram width calculation)
 707     for patch in self.items:
 708       i,d = 0,0
 709       for hunk in patch.hunks:
 710         for line in hunk.text:
 711           if line.startswith('+'):
 712             i += 1
 713             delta += len(line)-1
 714           elif line.startswith('-'):
 715             d += 1
 716             delta -= len(line)-1
 717       names.append(patch.target)
 718       insert.append(i)
 719       delete.append(d)
 720       namelen = max(namelen, len(patch.target))
 721       maxdiff = max(maxdiff, i+d)
 722     output = ''
 723     statlen = len(str(maxdiff))  # stats column width
 724     for i,n in enumerate(names):
 725       # %-19s | %-4d %s
 726       format = " %-" + str(namelen) + "s | %" + str(statlen) + "s %s\n"
 727
 728       hist = ''
 729       # -- calculating histogram --
 730       width = len(format % ('', '', ''))
 731       histwidth = max(2, 80 - width)
 732       if maxdiff < histwidth:
 733         hist = "+"*insert[i] + "-"*delete[i]
 734       else:
 735         iratio = (float(insert[i]) / maxdiff) * histwidth
 736         dratio = (float(delete[i]) / maxdiff) * histwidth
 737
 738         # make sure every entry gets at least one + or -
 739         iwidth = 1 if 0 < iratio < 1 else int(iratio)
 740         dwidth = 1 if 0 < dratio < 1 else int(dratio)
 741         #print iratio, dratio, iwidth, dwidth, histwidth
 742         hist = "+"*int(iwidth) + "-"*int(dwidth)
 743       # -- /calculating +- histogram --
 744       output += (format % (names[i], insert[i] + delete[i], hist))
 745
 746     output += (" %d files changed, %d insertions(+), %d deletions(-), %+d bytes"
 747                % (len(names), sum(insert), sum(delete), delta))
 748     return output
 749
 750
 751   def findfile(self, old, new):
 752     """ return name of file to be patched or None """
 753     if exists(old):
 754       return old
 755     elif exists(new):
 756       return new
 757     else:
 758       # [w] Google Code generates broken patches with its online editor
 759       debug("broken patch from Google Code, stripping prefixes..")
 760       if old.startswith('a/') and new.startswith('b/'):
 761         old, new = old[2:], new[2:]
 762         debug("   %s" % old)
 763         debug("   %s" % new)
 764         if exists(old):
 765           return old
 766         elif exists(new):
 767           return new
 768       return None
 769
 770
 771   def apply(self, strip=0, root=None):
 772     """ Apply parsed patch, optionally stripping leading components
 773         from file paths. `root` parameter specifies working dir.
 774         return True on success
 775     """
 776     if root:
 777       prevdir = os.getcwd()
 778       os.chdir(root)
 779
 780     total = len(self.items)
 781     errors = 0
 782     if strip:
 783       # [ ] test strip level exceeds nesting level
 784       #   [ ] test the same only for selected files
 785       #     [ ] test if files end up being on the same level
 786       try:
 787         strip = int(strip)
 788       except ValueError:
 789         errors += 1
 790         warning("error: strip parameter '%s' must be an integer" % strip)
 791         strip = 0
 792
 793     #for fileno, filename in enumerate(self.source):
 794     for i,p in enumerate(self.items):
 795       if strip:
 796         debug("stripping %s leading component(s) from:" % strip)
 797         debug("   %s" % p.source)
 798         debug("   %s" % p.target)
 799         old = pathstrip(p.source, strip)
 800         new = pathstrip(p.target, strip)
 801       else:
 802         old, new = p.source, p.target
 803
 804       filename = self.findfile(old, new)
 805
 806       if not filename:
 807           warning("source/target file does not exist:\n  --- %s\n  +++ %s" % (old, new))
 808           errors += 1
 809           continue
 810       if not isfile(filename):
 811         warning("not a file - %s" % filename)
 812         errors += 1
 813         continue
 814
 815       # [ ] check absolute paths security here
 816       debug("processing %d/%d:\t %s" % (i+1, total, filename))
 817
 818       # validate before patching
 819       f2fp = open(filename)
 820       hunkno = 0
 821       hunk = p.hunks[hunkno]
 822       hunkfind = []
 823       hunkreplace = []
 824       validhunks = 0
 825       canpatch = False
 826       for lineno, line in enumerate(f2fp):
 827         if lineno+1 < hunk.startsrc:
 828           continue
 829         elif lineno+1 == hunk.startsrc:
 830           hunkfind = [x[1:].rstrip("\r\n") for x in hunk.text if x[0] in " -"]
 831           hunkreplace = [x[1:].rstrip("\r\n") for x in hunk.text if x[0] in " +"]
 832           #pprint(hunkreplace)
 833           hunklineno = 0
 834
 835           # todo \ No newline at end of file
 836
 837         # check hunks in source file
 838         if lineno+1 < hunk.startsrc+len(hunkfind)-1:
 839           if line.rstrip("\r\n") == hunkfind[hunklineno]:
 840             hunklineno+=1
 841           else:
 842             info("file %d/%d:\t %s" % (i+1, total, filename))
 843             info(" hunk no.%d doesn't match source file at line %d" % (hunkno+1, lineno))
 844             info("  expected: %s" % hunkfind[hunklineno])
 845             info("  actual  : %s" % line.rstrip("\r\n"))
 846             # not counting this as error, because file may already be patched.
 847             # check if file is already patched is done after the number of
 848             # invalid hunks if found
 849             # TODO: check hunks against source/target file in one pass
 850             #   API - check(stream, srchunks, tgthunks)
 851             #           return tuple (srcerrs, tgterrs)
 852
 853             # continue to check other hunks for completeness
 854             hunkno += 1
 855             if hunkno < len(p.hunks):
 856               hunk = p.hunks[hunkno]
 857               continue
 858             else:
 859               break
 860
 861         # check if processed line is the last line
 862         if lineno+1 == hunk.startsrc+len(hunkfind)-1:
 863           debug(" hunk no.%d for file %s  -- is ready to be patched" % (hunkno+1, filename))
 864           hunkno+=1
 865           validhunks+=1
 866           if hunkno < len(p.hunks):
 867             hunk = p.hunks[hunkno]
 868           else:
 869             if validhunks == len(p.hunks):
 870               # patch file
 871               canpatch = True
 872               break
 873       else:
 874         if hunkno < len(p.hunks):
 875           warning("premature end of source file %s at hunk %d" % (filename, hunkno+1))
 876           errors += 1
 877
 878       f2fp.close()
 879
 880       if validhunks < len(p.hunks):
 881         if self._match_file_hunks(filename, p.hunks):
 882           warning("already patched  %s" % filename)
 883         else:
 884           warning("source file is different - %s" % filename)
 885           errors += 1
 886       if canpatch:
 887         backupname = filename+".orig"
 888         if exists(backupname):
 889           warning("can't backup original file to %s - aborting" % backupname)
 890         else:
 891           import shutil
 892           shutil.move(filename, backupname)
 893           if self.write_hunks(backupname, filename, p.hunks):
 894             info("successfully patched %d/%d:\t %s" % (i+1, total, filename))
 895             os.unlink(backupname)
 896           else:
 897             errors += 1
 898             warning("error patching file %s" % filename)
 899             shutil.copy(filename, filename+".invalid")
 900             warning("invalid version is saved to %s" % filename+".invalid")
 901             # todo: proper rejects
 902             shutil.move(backupname, filename)
 903
 904     if root:
 905       os.chdir(prevdir)
 906
 907     # todo: check for premature eof
 908     return (errors == 0)
 909
 910
 911   def _reverse(self):
 912     """ reverse patch direction (this doesn't touch filenames) """
 913     for p in self.items:
 914       for h in p.hunks:
 915         h.startsrc, h.starttgt = h.starttgt, h.startsrc
 916         h.linessrc, h.linestgt = h.linestgt, h.linessrc
 917         for i,line in enumerate(h.text):
 918           if line[0] == '+':
 919             h.text[i] = '-' + line[1:]
 920           elif line[0] == '-':
 921             h.text[i] = '+' +line[1:]
 922
 923   def revert(self, strip=0, root=None):
 924     """ apply patch in reverse order """
 925     reverted = copy.deepcopy(self)
 926     reverted._reverse()
 927     return reverted.apply(strip, root)
 928
 929
 930   def can_patch(self, filename):
 931     """ Check if specified filename can be patched. Returns None if file can
 932     not be found among source filenames. False if patch can not be applied
 933     clearly. True otherwise.
 934
 935     :returns: True, False or None
 936     """
 937     filename = abspath(filename)
 938     for p in self.items:
 939       if filename == abspath(p.source):
 940         return self._match_file_hunks(filename, p.hunks)
 941     return None
 942
 943
 944   def _match_file_hunks(self, filepath, hunks):
 945     matched = True
 946     fp = open(abspath(filepath))
 947
 948     class NoMatch(Exception):
 949       pass
 950
 951     lineno = 1
 952     line = fp.readline()
 953     hno = None
 954     try:
 955       for hno, h in enumerate(hunks):
 956         # skip to first line of the hunk
 957         while lineno < h.starttgt:
 958           if not len(line): # eof
 959             debug("check failed - premature eof before hunk: %d" % (hno+1))
 960             raise NoMatch
 961           line = fp.readline()
 962           lineno += 1
 963         for hline in h.text:
 964           if hline.startswith("-"):
 965             continue
 966           if not len(line):
 967             debug("check failed - premature eof on hunk: %d" % (hno+1))
 968             # todo: \ No newline at the end of file
 969             raise NoMatch
 970           if line.rstrip("\r\n") != hline[1:].rstrip("\r\n"):
 971             debug("file is not patched - failed hunk: %d" % (hno+1))
 972             raise NoMatch
 973           line = fp.readline()
 974           lineno += 1
 975
 976     except NoMatch:
 977       matched = False
 978       # todo: display failed hunk, i.e. expected/found
 979
 980     fp.close()
 981     return matched
 982
 983
 984   def patch_stream(self, instream, hunks):
 985     """ Generator that yields stream patched with hunks iterable
 986
 987         Converts lineends in hunk lines to the best suitable format
 988         autodetected from input
 989     """
 990
 991     # todo: At the moment substituted lineends may not be the same
 992     #       at the start and at the end of patching. Also issue a
 993     #       warning/throw about mixed lineends (is it really needed?)
 994
 995     hunks = iter(hunks)
 996
 997     srclineno = 1
 998
 999     lineends = {'\n':0, '\r\n':0, '\r':0}
1000     def get_line():
1001       """
1002       local utility function - return line from source stream
1003       collecting line end statistics on the way
1004       """
1005       line = instream.readline()
1006         # 'U' mode works only with text files
1007       if line.endswith("\r\n"):
1008         lineends["\r\n"] += 1
1009       elif line.endswith("\n"):
1010         lineends["\n"] += 1
1011       elif line.endswith("\r"):
1012         lineends["\r"] += 1
1013       return line
1014
1015     for hno, h in enumerate(hunks):
1016       debug("hunk %d" % (hno+1))
1017       # skip to line just before hunk starts
1018       while srclineno < h.startsrc:
1019         yield get_line()
1020         srclineno += 1
1021
1022       for hline in h.text:
1023         # todo: check \ No newline at the end of file
1024         if hline.startswith("-") or hline.startswith("\\"):
1025           get_line()
1026           srclineno += 1
1027           continue
1028         else:
1029           if not hline.startswith("+"):
1030             get_line()
1031             srclineno += 1
1032           line2write = hline[1:]
1033           # detect if line ends are consistent in source file
1034           if sum([bool(lineends[x]) for x in lineends]) == 1:
1035             newline = [x for x in lineends if lineends[x] != 0][0]
1036             yield line2write.rstrip("\r\n")+newline
1037           else: # newlines are mixed
1038             yield line2write
1039
1040     for line in instream:
1041       yield line
1042
1043
1044   def write_hunks(self, srcname, tgtname, hunks):
1045     src = open(srcname, "rb")
1046     tgt = open(tgtname, "wb")
1047
1048     debug("processing target file %s" % tgtname)
1049
1050     tgt.writelines(self.patch_stream(src, hunks))
1051
1052     tgt.close()
1053     src.close()
1054     # [ ] TODO: add test for permission copy
1055     shutil.copymode(srcname, tgtname)
1056     return True
1057
1058
1059   def dump(self):
1060     for p in self.items:
1061       for headline in p.header:
1062         print headline.rstrip('\n')
1063       print '--- ' + p.source
1064       print '+++ ' + p.target
1065       for h in p.hunks:
1066         print '@@ -%s,%s +%s,%s @@' % (h.startsrc, h.linessrc, h.starttgt, h.linestgt)
1067         for line in h.text:
1068           print line.rstrip('\n')
1069
1070
def main():
  """ Command line entry point

      Parses options, configures the module logger, reads the patch from
      stdin (last argument is `--`), from URL or from file, then prints
      diffstat, applies the patch or reverts it.

      Exits through sys.exit(): without arguments after printing version
      and help, with a message if the patch file does not exist, with 0
      after --diffstat and with -1 if apply/revert reports failure.
  """
  # --debug has to set the module-level flag, not a local of the same name
  global debugmode

  from optparse import OptionParser
  from os.path import exists
  import sys

  opt = OptionParser(usage="1. %prog [options] unified.diff\n"
                    "       2. %prog [options] http://host/patch\n"
                    "       3. %prog [options] -- < unified.diff",
                     version="python-patch %s" % __version__)
  opt.add_option("-q", "--quiet", action="store_const", dest="verbosity",
                                  const=0, help="print only warnings and errors", default=1)
  opt.add_option("-v", "--verbose", action="store_const", dest="verbosity",
                                  const=2, help="be verbose")
  opt.add_option("--debug", action="store_true", dest="debugmode", help="debug mode")
  opt.add_option("--diffstat", action="store_true", dest="diffstat",
                                           help="print diffstat and exit")
  opt.add_option("-d", "--directory", metavar='DIR',
                                           help="specify root directory for applying patch")
  opt.add_option("-p", "--strip", type="int", metavar='N', default=0,
                                           help="strip N path components from filenames")
  opt.add_option("--revert", action="store_true",
                                           help="apply patch in reverse order (unpatch)")
  (options, args) = opt.parse_args()

  # optparse removes `--` from args, so sys.argv is checked to see
  # whether patch should be read from stdin
  if not args and sys.argv[-1:] != ['--']:
    opt.print_version()
    opt.print_help()
    sys.exit()
  readstdin = (sys.argv[-1:] == ['--'] and not args)

  # option value is None when --debug is not given
  debugmode = bool(options.debugmode)

  verbosity_levels = {0:logging.WARNING, 1:logging.INFO, 2:logging.DEBUG}
  loglevel = verbosity_levels[options.verbosity]
  logformat = "%(message)s"
  if debugmode:
    # debug mode overrides -q/-v and shows level of every message
    loglevel = logging.DEBUG
    logformat = "%(levelname)8s %(message)s"
  logger.setLevel(loglevel)
  loghandler = logging.StreamHandler()
  loghandler.setFormatter(logging.Formatter(logformat))
  logger.addHandler(loghandler)


  if readstdin:
    patch = PatchSet(sys.stdin)
  else:
    patchfile = args[0]
    urltest = patchfile.split(':')[0]
    if (':' in patchfile and urltest.isalpha()
        and len(urltest) > 1): # one char before : is a windows drive letter
      patch = fromurl(patchfile)
    else:
      if not exists(patchfile) or not isfile(patchfile):
        sys.exit("patch file does not exist - %s" % patchfile)
      patch = fromfile(patchfile)

  if options.diffstat:
    print(patch.diffstat())
    sys.exit(0)

  #pprint(patch)
  if options.revert:
    succeeded = patch.revert(options.strip, root=options.directory)
  else:
    succeeded = patch.apply(options.strip, root=options.directory)
  if not succeeded:
    sys.exit(-1)

  # todo: document and test line ends handling logic - patch.py detects proper line-endings
  #       for inserted hunks and issues a warning if patched file has inconsistent line ends
1140
1141
# run the command line interface only when the file is executed as a
# script, not when it is imported as a module
if __name__ == "__main__":
  main()
1144
1145 # Legend:
# [ ]  - something to be done
1147 # [w]  - official wart, external or internal that is unlikely to be fixed
1148
1149 # [ ] API break (2.x) wishlist
1150 # PatchSet.items  -->  PatchSet.patches
1151
1152 # [ ] run --revert test for all dataset items
1153 # [ ] run .parse() / .dump() test for dataset
1154