Tools/textwrap.py

   1 """Text wrapping and filling.
   2 """
   3
   4 # Copyright (C) 1999-2001 Gregory P. Ward.
   5 # Copyright (C) 2002, 2003 Python Software Foundation.
   6 # Written by Greg Ward <gward@python.net>
   7
   8 __revision__ = "$Id: textwrap.py,v 1.32.8.3 2004/06/03 01:53:13 gward Exp $"
   9
  10 import string, re
  11
# Do the right thing with boolean values for all known Python versions
# (so this module can be copied to projects that don't depend on Python
# 2.3, e.g. Optik and Docutils).  Merely evaluating the two names raises
# NameError when they are not defined; in that case they are bound to
# the integers 1 and 0 instead.
try:
    True, False
except NameError:
    (True, False) = (1, 0)

# Names exported by a star-import of this module.  Note that dedent()
# is not listed here, so it has to be imported by name.
__all__ = ['TextWrapper', 'wrap', 'fill']

# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that in
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
# that character winds up in string.whitespace.  Respecting
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
# same as any other whitespace char, which is clearly wrong (it's a
# *non-breaking* space), 2) possibly cause problems with Unicode,
# since 0xa0 is not in range(128).
#
# The characters are, in order: tab, newline, vertical tab, form feed,
# carriage return and space.
_whitespace = '\t\n\x0b\x0c\r '
  31
  32 class TextWrapper:
  33     """
  34     Object for wrapping/filling text.  The public interface consists of
  35     the wrap() and fill() methods; the other methods are just there for
  36     subclasses to override in order to tweak the default behaviour.
  37     If you want to completely replace the main wrapping algorithm,
  38     you'll probably have to override _wrap_chunks().
  39
  40     Several instance attributes control various aspects of wrapping:
  41       width (default: 70)
  42         the maximum width of wrapped lines (unless break_long_words
  43         is false)
  44       initial_indent (default: "")
  45         string that will be prepended to the first line of wrapped
  46         output.  Counts towards the line's width.
  47       subsequent_indent (default: "")
  48         string that will be prepended to all lines save the first
  49         of wrapped output; also counts towards each line's width.
  50       expand_tabs (default: true)
  51         Expand tabs in input text to spaces before further processing.
  52         Each tab will become 1 .. 8 spaces, depending on its position in
  53         its line.  If false, each tab is treated as a single character.
  54       replace_whitespace (default: true)
  55         Replace all whitespace characters in the input text by spaces
  56         after tab expansion.  Note that if expand_tabs is false and
  57         replace_whitespace is true, every tab will be converted to a
  58         single space!
  59       fix_sentence_endings (default: false)
  60         Ensure that sentence-ending punctuation is always followed
  61         by two spaces.  Off by default because the algorithm is
  62         (unavoidably) imperfect.
  63       break_long_words (default: true)
  64         Break words longer than 'width'.  If false, those words will not
  65         be broken, and some lines might be longer than 'width'.
  66     """
  67
  68     whitespace_trans = string.maketrans(_whitespace, ' ' * len(_whitespace))
  69
  70     unicode_whitespace_trans = {}
  71     uspace = ord(u' ')
  72     for x in map(ord, _whitespace):
  73         unicode_whitespace_trans[x] = uspace
  74
  75     # This funky little regex is just the trick for splitting
  76     # text up into word-wrappable chunks.  E.g.
  77     #   "Hello there -- you goof-ball, use the -b option!"
  78     # splits into
  79     #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
  80     # (after stripping out empty strings).
  81     wordsep_re = re.compile(r'(\s+|'                  # any whitespace
  82                             r'[^\s\w]*\w{2,}-(?=\w{2,})|' # hyphenated words
  83                             r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash
  84
  85     # XXX this is not locale- or charset-aware -- string.lowercase
  86     # is US-ASCII only (and therefore English-only)
  87     sentence_end_re = re.compile(r'[%s]'              # lowercase letter
  88                                  r'[\.\!\?]'          # sentence-ending punct.
  89                                  r'[\"\']?'           # optional end-of-quote
  90                                  % string.lowercase)
  91
  92
  93     def __init__(self,
  94                  width=70,
  95                  initial_indent="",
  96                  subsequent_indent="",
  97                  expand_tabs=True,
  98                  replace_whitespace=True,
  99                  fix_sentence_endings=False,
 100                  break_long_words=True):
 101         self.width = width
 102         self.initial_indent = initial_indent
 103         self.subsequent_indent = subsequent_indent
 104         self.expand_tabs = expand_tabs
 105         self.replace_whitespace = replace_whitespace
 106         self.fix_sentence_endings = fix_sentence_endings
 107         self.break_long_words = break_long_words
 108
 109
 110     # -- Private methods -----------------------------------------------
 111     # (possibly useful for subclasses to override)
 112
 113     def _munge_whitespace(self, text):
 114         """_munge_whitespace(text : string) -> string
 115
 116         Munge whitespace in text: expand tabs and convert all other
 117         whitespace characters to spaces.  Eg. " foo\tbar\n\nbaz"
 118         becomes " foo    bar  baz".
 119         """
 120         if self.expand_tabs:
 121             text = text.expandtabs()
 122         if self.replace_whitespace:
 123             if isinstance(text, str):
 124                 text = text.translate(self.whitespace_trans)
 125             elif isinstance(text, unicode):
 126                 text = text.translate(self.unicode_whitespace_trans)
 127         return text
 128
 129
 130     def _split(self, text):
 131         """_split(text : string) -> [string]
 132
 133         Split the text to wrap into indivisible chunks.  Chunks are
 134         not quite the same as words; see wrap_chunks() for full
 135         details.  As an example, the text
 136           Look, goof-ball -- use the -b option!
 137         breaks into the following chunks:
 138           'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
 139           'use', ' ', 'the', ' ', '-b', ' ', 'option!'
 140         """
 141         chunks = self.wordsep_re.split(text)
 142         chunks = filter(None, chunks)
 143         return chunks
 144
 145     def _fix_sentence_endings(self, chunks):
 146         """_fix_sentence_endings(chunks : [string])
 147
 148         Correct for sentence endings buried in 'chunks'.  Eg. when the
 149         original text contains "... foo.\nBar ...", munge_whitespace()
 150         and split() will convert that to [..., "foo.", " ", "Bar", ...]
 151         which has one too few spaces; this method simply changes the one
 152         space to two.
 153         """
 154         i = 0
 155         pat = self.sentence_end_re
 156         while i < len(chunks)-1:
 157             if chunks[i+1] == " " and pat.search(chunks[i]):
 158                 chunks[i+1] = "  "
 159                 i += 2
 160             else:
 161                 i += 1
 162
 163     def _handle_long_word(self, chunks, cur_line, cur_len, width):
 164         """_handle_long_word(chunks : [string],
 165                              cur_line : [string],
 166                              cur_len : int, width : int)
 167
 168         Handle a chunk of text (most likely a word, not whitespace) that
 169         is too long to fit in any line.
 170         """
 171         space_left = max(width - cur_len, 1)
 172
 173         # If we're allowed to break long words, then do so: put as much
 174         # of the next chunk onto the current line as will fit.
 175         if self.break_long_words:
 176             cur_line.append(chunks[0][0:space_left])
 177             chunks[0] = chunks[0][space_left:]
 178
 179         # Otherwise, we have to preserve the long word intact.  Only add
 180         # it to the current line if there's nothing already there --
 181         # that minimizes how much we violate the width constraint.
 182         elif not cur_line:
 183             cur_line.append(chunks.pop(0))
 184
 185         # If we're not allowed to break long words, and there's already
 186         # text on the current line, do nothing.  Next time through the
 187         # main loop of _wrap_chunks(), we'll wind up here again, but
 188         # cur_len will be zero, so the next line will be entirely
 189         # devoted to the long word that we can't handle right now.
 190
 191     def _wrap_chunks(self, chunks):
 192         """_wrap_chunks(chunks : [string]) -> [string]
 193
 194         Wrap a sequence of text chunks and return a list of lines of
 195         length 'self.width' or less.  (If 'break_long_words' is false,
 196         some lines may be longer than this.)  Chunks correspond roughly
 197         to words and the whitespace between them: each chunk is
 198         indivisible (modulo 'break_long_words'), but a line break can
 199         come between any two chunks.  Chunks should not have internal
 200         whitespace; ie. a chunk is either all whitespace or a "word".
 201         Whitespace chunks will be removed from the beginning and end of
 202         lines, but apart from that whitespace is preserved.
 203         """
 204         lines = []
 205         if self.width <= 0:
 206             raise ValueError("invalid width %r (must be > 0)" % self.width)
 207
 208         while chunks:
 209
 210             # Start the list of chunks that will make up the current line.
 211             # cur_len is just the length of all the chunks in cur_line.
 212             cur_line = []
 213             cur_len = 0
 214
 215             # Figure out which static string will prefix this line.
 216             if lines:
 217                 indent = self.subsequent_indent
 218             else:
 219                 indent = self.initial_indent
 220
 221             # Maximum width for this line.
 222             width = self.width - len(indent)
 223
 224             # First chunk on line is whitespace -- drop it, unless this
 225             # is the very beginning of the text (ie. no lines started yet).
 226             if chunks[0].strip() == '' and lines:
 227                 del chunks[0]
 228
 229             while chunks:
 230                 l = len(chunks[0])
 231
 232                 # Can at least squeeze this chunk onto the current line.
 233                 if cur_len + l <= width:
 234                     cur_line.append(chunks.pop(0))
 235                     cur_len += l
 236
 237                 # Nope, this line is full.
 238                 else:
 239                     break
 240
 241             # The current line is full, and the next chunk is too big to
 242             # fit on *any* line (not just this one).
 243             if chunks and len(chunks[0]) > width:
 244                 self._handle_long_word(chunks, cur_line, cur_len, width)
 245
 246             # If the last chunk on this line is all whitespace, drop it.
 247             if cur_line and cur_line[-1].strip() == '':
 248                 del cur_line[-1]
 249
 250             # Convert current line back to a string and store it in list
 251             # of all lines (return value).
 252             if cur_line:
 253                 lines.append(indent + ''.join(cur_line))
 254
 255         return lines
 256
 257
 258     # -- Public interface ----------------------------------------------
 259
 260     def wrap(self, text):
 261         """wrap(text : string) -> [string]
 262
 263         Reformat the single paragraph in 'text' so it fits in lines of
 264         no more than 'self.width' columns, and return a list of wrapped
 265         lines.  Tabs in 'text' are expanded with string.expandtabs(),
 266         and all other whitespace characters (including newline) are
 267         converted to space.
 268         """
 269         text = self._munge_whitespace(text)
 270         indent = self.initial_indent
 271         chunks = self._split(text)
 272         if self.fix_sentence_endings:
 273             self._fix_sentence_endings(chunks)
 274         return self._wrap_chunks(chunks)
 275
 276     def fill(self, text):
 277         """fill(text : string) -> string
 278
 279         Reformat the single paragraph in 'text' to fit in lines of no
 280         more than 'self.width' columns, and return a new string
 281         containing the entire wrapped paragraph.
 282         """
 283         return "\n".join(self.wrap(text))
 284
 285
 286 # -- Convenience interface ---------------------------------------------
 287
 288 def wrap(text, width=70, **kwargs):
 289     """Wrap a single paragraph of text, returning a list of wrapped lines.
 290
 291     Reformat the single paragraph in 'text' so it fits in lines of no
 292     more than 'width' columns, and return a list of wrapped lines.  By
 293     default, tabs in 'text' are expanded with string.expandtabs(), and
 294     all other whitespace characters (including newline) are converted to
 295     space.  See TextWrapper class for available keyword args to customize
 296     wrapping behaviour.
 297     """
 298     w = TextWrapper(width=width, **kwargs)
 299     return w.wrap(text)
 300
 301 def fill(text, width=70, **kwargs):
 302     """Fill a single paragraph of text, returning a new string.
 303
 304     Reformat the single paragraph in 'text' to fit in lines of no more
 305     than 'width' columns, and return a new string containing the entire
 306     wrapped paragraph.  As with wrap(), tabs are expanded and other
 307     whitespace characters converted to space.  See TextWrapper class for
 308     available keyword args to customize wrapping behaviour.
 309     """
 310     w = TextWrapper(width=width, **kwargs)
 311     return w.fill(text)
 312
 313
 314 # -- Loosely related functionality -------------------------------------
 315
 316 def dedent(text):
 317     """dedent(text : string) -> string
 318
 319     Remove any whitespace than can be uniformly removed from the left
 320     of every line in `text`.
 321
 322     This can be used e.g. to make triple-quoted strings line up with
 323     the left edge of screen/whatever, while still presenting it in the
 324     source code in indented form.
 325
 326     For example:
 327
 328         def test():
 329             # end first line with \ to avoid the empty line!
 330             s = '''\
 331             hello
 332               world
 333             '''
 334             print repr(s)          # prints '    hello\n      world\n    '
 335             print repr(dedent(s))  # prints 'hello\n  world\n'
 336     """
 337     lines = text.expandtabs().split('\n')
 338     margin = None
 339     for line in lines:
 340         content = line.lstrip()
 341         if not content:
 342             continue
 343         indent = len(line) - len(content)
 344         if margin is None:
 345             margin = indent
 346         else:
 347             margin = min(margin, indent)
 348
 349     if margin is not None and margin > 0:
 350         for i in range(len(lines)):
 351             lines[i] = lines[i][margin:]
 352
 353     return '\n'.join(lines)