1 #!/usr/bin/env python
    2 
    3 # Copyright 2003 Tom Rothamel <tom-potw@rothamel.us>
    4 # 
    5 # Permission is hereby granted, free of charge, to any person
    6 # obtaining a copy of this software and associated documentation files
    7 # (the "Software"), to deal in the Software without restriction,
    8 # including without limitation the rights to use, copy, modify, merge,
    9 # publish, distribute, sublicense, and/or sell copies of the Software,
   10 # and to permit persons to whom the Software is furnished to do so,
   11 # subject to the following conditions:
   12 # 
   13 # The above copyright notice and this permission notice shall be
   14 # included in all copies or substantial portions of the Software.
   15 # 
   16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   19 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
   20 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
   21 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
   22 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   23 
   24 import re
   25 import sys
   26 import optparse
   27 
   28 # NOTE: In comments, PRM is used to refer to the Python Reference
   29 # Manual, Python 2.3.2 edition.
   30 
   31 # This is a map from a symbolic style name to a CSS style, allowing us
   32 # to change the CSS styles in one convenient place.
   33 styles = {
   34     'string' : 'color: #006000;',
   35     'comment' : 'color: #800000;',
   36     'keyword' : 'color: #804000;',
   37     'highlight' : 'color: #000080;',
   38     }
   39 
   40 ##############################################################################
   41 
# Tokenizer state. tokenize() creates one instance per call and passes it
# to every token handler.
   43 class State(object):
   44     """Fields on this object store state that needs to be passed between
   45     tokens.
   46     """
   47 
   48     def __init__(self):
   49 
   50         # This is used to indicate if the next name should be hilighted.
   51         # It's sent to True when def or class are seen, and is used to
   52         # hilight the name of the construct being defined.
   53         self.highlight_name = False
   54 
# Token handlers. Each one takes the regex match for a single token plus
# the State object, and returns a list of (style, text) pairs.
   57         
   58 def tok_comment(m, state):
   59     """Returns a formatted comment."""
   60     return [ ('comment', m.group(0)) ]
   61 
   62 
   63 def tok_whitespace(m, state):
   64     """Returns formatted whitespace."""
   65     return [ (None, m.group(0)) ]
   66 
   67 # The set of all keywords defined in python. as and None will become
   68 # keywords in a future release, but we include them now.
   69 # (PRM 2.3.1)
   70 keywords = """
   71 and       del       for       is        raise    
   72 assert    elif      from      lambda    return   
   73 break     else      global    not       try      
   74 class     except    if        or        while    
   75 continue  exec      import    pass      yield    
   76 def       finally   in        print     as
   77 None
   78 """.split()
   79 
   80 def tok_name(m, state):
   81     """Returns a formatted name."""
   82 
   83     name = m.group(0)
   84 
   85     # If we have a function or class definition, remember to highlight
   86     # the next name.
   87     if name in ("def", "class"):
   88         state.highlight_name = True
   89 
   90     # If we have a keyword, style it.
   91     if name in keywords:
   92         return [ ("keyword", m.group(0)) ]
   93 
   94     # If we want to highlight the next name, do so.
   95     if state.highlight_name:
   96         state.highlight_name = False
   97         return [ ("highlight", m.group(0)) ]
   98     
   99     # Otherwise, don't style anything.
  100     return [ (None, m.group(0)) ]
  101 
  102 def tok_number(m, state):
  103     """Returns a formatted number."""
  104 
  105     return [ (None, m.group(0)) ]
  106 
  107 def tok_other(m, state):
  108     """Returns a formatted other (this usually matches operators)."""
  109     
  110     return [ (None, m.group(0)) ]
  111 
  112 def tok_string(m, state):
  113     """Returns a formatted string."""
  114 
  115     # Style the body of the string, but leave the delimiters unstyled.
  116     return [ (None, m.group('start')),
  117              ("string", m.group('body')),
  118              (None, m.group('end')) ]
  119 
  120 ##########################################################################
  121 
  122 def stringpat(delim):
  123     """Given a delimiters, return a regular expression that matches
  124     strings delimited by those characters.
  125     """
  126 
  127     # How to match characters that can be in the string but are not
  128     # the delimiter.
  129 
  130 
  131     # (?<!\\) asserts that the character is not preceded by a \.
  132     # So, this matches the shortest string such that the delimiter
  133     # is not preceded by an odd number of backslashes.
  134     #
  135     # More precisely, we match the shortest string of the form:
  136     #
  137     # opening
  138     # anything
  139     # something other than a backslash
  140     # any even number of backslashes
  141     # closing
  142         
  143     nodelim = r"(.*?(?<!\\)(?:\\\\)*)?"
  144 
  145     # Taken from PRM 2.4.1
  146     return ( r"(?P<start>[ru]*%s)" \
  147              r"(?P<body>%s)" \
  148              r"(?P<end>%s)" \
  149              % (delim, nodelim, delim) )
  150 
  151 # This is a list of regular expression, function pairs. It's used to
  152 # tokenize the file. When a regular expression matches, the match
  153 # object created is passed to the corresponding function, which is
  154 # responsible for returning formatted text.
  155 patterns = [
  156 
  157     # PRM 2.4.1 (string literals)
  158     ( stringpat('"""'), tok_string),
  159     ( stringpat('"'), tok_string),
  160     ( stringpat("'''"), tok_string),
  161     ( stringpat("'"), tok_string),
  162 
  163     # PRM 2.4.6 (imaginary), gives j at the end of the next few patterns.
  164 
  165     # PRM 2.4.5 (floating point)
  166     ( r"(\d*\.\d+|\d+\.)(e[+-]?\d+)?j?", tok_number),
  167 
  168     # PRM 2.4.4 (integers)
  169     ( r"0x?[0-9a-f]+l?j?", tok_number),
  170     ( r"0[0-7]+l?j?", tok_number ),
  171     ( r"\d+l?j?", tok_number ),
  172 
  173     # The rest of these are mostly common sense.
  174 
  175     # Names.
  176     ( r"\w+", tok_name ),
  177     # Whitespace.
  178     ( r"\s+", tok_whitespace ),
  179     # Comments.
  180     ( r"#[^\r\n]*", tok_comment),    
  181 
  182     # Everything that doesn't match something else matches here. 
  183     ( r"[^\w\s\"\'\#]+", tok_other),
  184     ]
  185 
  186 # Compile the patterns.
  187 patterns = [ (re.compile(regex, re.S | re.I), action)
  188              for regex, action in patterns ]
  189 
  190 
  191 def tokenize(source):
  192     """This a generator that tokenizes the source into a a sequence
  193     of style, text pairs.
  194     """
  195     
  196     lens = len(source)
  197     pos = 0
  198 
  199     # Instatiate the state object.
  200     state = State()
  201 
  202     # We advance along the string...
  203     while pos < lens:
  204 
  205         # ...finding a regex that matches...
  206         for regex, action in patterns:
  207             m = regex.match(source, pos)
  208 
  209             if m:
  210                 break
  211         else:
  212             # If no regex matches, it's an error. (Should never happen.)
  213             print >>sys.stderr, repr(source[pos:pos + 200])            
  214             print >>sys.stderr
  215             raise Exception, "Didn't match token."
  216 
  217         # ... and calling the appropriate action with the match and
  218         # state objects.
  219 
  220         tokens = action(m, state)
  221 
  222         # This gives us a list of style, text tokens, which we then
  223         # yield up for formatting.
  224         for style, text in tokens:            
  225             yield style, text
  226 
  227         # Update the new position in the string to the end of the last
  228         # match.
  229         pos = m.end()
  230 
  231 ##########################################################################
  232 
  233 def quote(s):
  234     """Escapes characters that would otherwise have special meanings
  235     in html. Only works for text, not attributes.
  236     """
  237 
  238     s = re.sub(r"&", "&amp;", s)
  239     s = re.sub(r"<", "&lt;", s)
  240 
  241     return s
  242 
  243 def format(source, out):
  244     """Formats the source as html, and sends it to the file out. fn is
  245     a filename associated with the source being formatted, which is used
  246     to make <a name> anchors for each line number.
  247     """
  248 
  249     line = 1
  250 
  251     # This is what we do at the start of each line. (Print out a
  252     # line number and anchor.)
  253     def newline():
  254         if options.line_numbers:
  255             out.write('<a name="%d">% 5d</a> ' % (line, line))
  256         else:
  257             out.write('<a name="%d" /> ' % line)
  258 
  259     newline()
  260 
  261     for style, text in tokenize(source):
  262 
  263         # Further split the text into chunks of text,
  264         # separated by newlines.
  265         for t in re.split(r"(\n)", text):
  266 
  267             if t == '\n':
  268                 # If we have a newline, write it out.
  269                 out.write(t)
  270 
  271                 # We have to do this increment here to get the scopes
  272                 # right.
  273                 line += 1
  274 
  275                 # Write out the line preamble.
  276                 newline()
  277 
  278             elif style:
  279                 # If we have a style, emit a quoted and styled block of text.
  280                 out.write('<span style="%s">%s</span>'
  281                           % (styles[style], quote(t)))
  282                 
  283             else:
  284                 # Otherwise, emit a block of text that is quoted, but not
  285                 # styled.
  286                 out.write(quote(t))
  287                 
  288 
  289 def main():
  290 
  291     # Parse the options.
  292 
  293     usage = """%prog <files>
  294 
  295 For each python source file listed in <files>, generates file.html
  296 containing colorized html, anchors, and optional line numbers.
  297 """
  298 
  299     op = optparse.OptionParser(usage=usage, version="%prog 1")    
  300     op.add_option("-n", "--line-numbers",
  301                   action="store_true", dest="line_numbers", default=False,
  302                   help="Prefix each line with its line number.")
  303 
  304     global options
  305     options, args = op.parse_args()
  306 
  307     if len(args) == 0:
  308         op.error('You must specify at least one file to process.')
  309 
  310 
  311     # Iterate over the files, converting each to html.
  312     
  313     for fn in args:
  314         print fn
  315 
  316         # Open the input and output files.
  317         f = file(fn, "rU")
  318         out = file(fn + ".html", "wU")
  319 
  320         # Write out a quick html header.
  321         out.write("<html><head><title>%s</title></head><body>\n" % fn)
  322         out.write("<pre>\n")
  323 
  324         # Format the python source.
  325         format(f.read(), out)
  326 
  327         # Write out a quick html footer.
  328         out.write("</pre>\n")
  329         out.write("</body></html>\n")
  330 
  331         out.close()
  332         f.close()
  333 
  334 
# Run the converter only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
  337