1 #!/usr/bin/env python
2
3 # Copyright 2003 Tom Rothamel <tom-potw@rothamel.us>
4 #
5 # Permission is hereby granted, free of charge, to any person
6 # obtaining a copy of this software and associated documentation files
7 # (the "Software"), to deal in the Software without restriction,
8 # including without limitation the rights to use, copy, modify, merge,
9 # publish, distribute, sublicense, and/or sell copies of the Software,
10 # and to permit persons to whom the Software is furnished to do so,
11 # subject to the following conditions:
12 #
13 # The above copyright notice and this permission notice shall be
14 # included in all copies or substantial portions of the Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24 import re
25 import sys
26 import optparse
27
28 # NOTE: In comments, PRM is used to refer to the Python Reference
29 # Manual, Python 2.3.2 edition.
30
31 # This is a map from a symbolic style name to a CSS style, allowing us
32 # to change the CSS styles in one convenient place.
# Symbolic style name -> inline CSS declaration.  format() looks each
# style up here when it wraps a token in a <span>.
styles = dict(
    string='color: #006000;',
    comment='color: #800000;',
    keyword='color: #804000;',
    highlight='color: #000080;'
)
39
40 ##############################################################################
41
42 # The singleton state object, accessed by the tokenizers.
class State(object):
    """Mutable scratch space shared by the token functions.

    One instance is created per tokenize() run and handed to every
    tok_* function, so that one token can influence how a later token
    is styled.
    """

    def __init__(self):
        # True when the next non-keyword name should be highlighted.
        # tok_name sets it after seeing "def" or "class" and clears it
        # again once it has highlighted the name being defined.
        self.highlight_name = False
54
55 # The following take in a matched token, and return a list of
56 # style, text pairs.
57
def tok_comment(m, state):
    """Style the whole matched text as a comment.

    m is the regex match for the token; state is accepted for
    signature compatibility with the other token functions and is not
    used.  Returns a one-element list of (style, text) pairs.
    """
    text = m.group(0)
    return [('comment', text)]
61
62
def tok_whitespace(m, state):
    """Pass matched whitespace through without any style.

    m is the regex match for the token; state is not used.  Returns a
    one-element list holding the (None, text) pair.
    """
    blank = m.group(0)
    return [(None, blank)]
66
67 # The set of all keywords defined in python. as and None will become
68 # keywords in a future release, but we include them now.
69 # (PRM 2.3.1)
# Every word tok_name should style as a keyword, one row per line of
# the reference table this list was taken from.
keywords = [
    "and", "del", "for", "is", "raise",
    "assert", "elif", "from", "lambda", "return",
    "break", "else", "global", "not", "try",
    "class", "except", "if", "or", "while",
    "continue", "exec", "import", "pass", "yield",
    "def", "finally", "in", "print", "as",
    "None",
]
79
def tok_name(m, state):
    """Style a matched word as a keyword, a highlighted name or plain text.

    m is the regex match for the word.  state is the shared State
    object: seeing "def" or "class" sets state.highlight_name, and the
    next word that is not a keyword consumes that flag and is returned
    with the "highlight" style.  Returns a one-element list of
    (style, text) pairs.
    """

    word = m.group(0)

    # A definition is starting, so arm highlighting for the name that
    # follows it.
    if word == "def" or word == "class":
        state.highlight_name = True

    # Keywords are styled and leave the highlight flag untouched.
    if word in keywords:
        return [("keyword", word)]

    # Nothing pending: an ordinary, unstyled name.
    if not state.highlight_name:
        return [(None, word)]

    # This is the name being defined; highlight it exactly once.
    state.highlight_name = False
    return [("highlight", word)]
101
def tok_number(m, state):
    """Pass a matched numeric literal through without any style.

    m is the regex match for the literal; state is not used.  Returns a
    one-element list holding the (None, text) pair.
    """
    literal = m.group(0)
    return [(None, literal)]
106
def tok_other(m, state):
    """Pass through text matched by the catch-all pattern, unstyled.

    In practice this is mostly operators and punctuation.  m is the
    regex match; state is not used.  Returns a one-element list holding
    the (None, text) pair.
    """
    leftover = m.group(0)
    return [(None, leftover)]
111
def tok_string(m, state):
    """Style the body of a string literal, leaving its delimiters plain.

    m must be a match that defines the named groups 'start', 'body' and
    'end' (as the patterns built by stringpat do); state is not used.
    Returns three (style, text) pairs in source order.
    """

    opening, body, closing = m.group('start', 'body', 'end')

    return [
        (None, opening),
        ("string", body),
        (None, closing),
    ]
119
120 ##########################################################################
121
def stringpat(delim):
    """Build the regular expression source for one kind of string literal.

    delim is the quote sequence that opens and closes the literal.  The
    returned pattern has three named groups: 'start' (an optional run
    of r/u prefix letters plus the opening delimiter), 'body' and 'end'
    (the closing delimiter).
    """

    # The body is the shortest run of characters that ends just before
    # a delimiter which is not escaped.  (?<!\\) demands that the lazy
    # part does not end on a backslash, and (?:\\\\)* then swallows
    # backslashes in pairs, so the closing delimiter always follows an
    # even number of backslashes.  The trailing ? lets the body be empty.
    body = r"(.*?(?<!\\)(?:\\\\)*)?"

    # Prefix letters are taken from PRM 2.4.1.
    pieces = [
        r"(?P<start>[ru]*" + delim + r")",
        r"(?P<body>" + body + r")",
        r"(?P<end>" + delim + r")",
    ]

    return "".join(pieces)
150
151 # This is a list of regular expression, function pairs. It's used to
152 # tokenize the file. When a regular expression matches, the match
153 # object created is passed to the corresponding function, which is
154 # responsible for returning formatted text.
# Compiled (regex, action) pairs, in priority order.  tokenize() uses
# the first pattern that matches at the current position, so the order
# of this table matters.  re.S lets "." in string bodies cross line
# ends; re.I lets the lower-case prefix, suffix and hex-digit letters
# below match in either case.
patterns = [
    (re.compile(regex, re.S | re.I), action)
    for regex, action in [

        # String literals (PRM 2.4.1).  Each triple-quoted form comes
        # before its single-quoted form so it is tried first.
        (stringpat('"""'), tok_string),
        (stringpat('"'), tok_string),
        (stringpat("'''"), tok_string),
        (stringpat("'"), tok_string),

        # The optional trailing j on the numeric patterns is the
        # imaginary suffix (PRM 2.4.6).

        # Floating point literals (PRM 2.4.5).
        (r"(\d*\.\d+|\d+\.)(e[+-]?\d+)?j?", tok_number),

        # Integer literals (PRM 2.4.4).
        (r"0x?[0-9a-f]+l?j?", tok_number),
        (r"0[0-7]+l?j?", tok_number),
        (r"\d+l?j?", tok_number),

        # Names and keywords.
        (r"\w+", tok_name),

        # Whitespace.
        (r"\s+", tok_whitespace),

        # Comments run to the end of the line.
        (r"#[^\r\n]*", tok_comment),

        # Anything not claimed above, excluding quotes and "#".
        (r"[^\w\s\"\'\#]+", tok_other),
    ]
]
189
190
def tokenize(source):
    """Generate (style, text) pairs that together cover all of source.

    source is the text to tokenize.  At each position the entries of
    the module-level patterns table are tried in order; the first regex
    that matches is used and its action is called with the match and a
    State object shared by the whole run.  Every pair the action
    returns is yielded, then scanning resumes at the end of the match.

    Raises Exception if no pattern matches at some position; up to 200
    characters of the offending text are written to stderr first.
    """

    end = len(source)
    pos = 0

    # One state object per run, so that state left behind by one
    # source text cannot leak into the next.
    state = State()

    while pos < end:

        # Find the first pattern that matches at the current position.
        for regex, action in patterns:
            m = regex.match(source, pos)
            if m:
                break
        else:
            # No pattern matched.  Show what could not be tokenized,
            # followed by a blank line, and give up.  The write() and
            # raise forms below are valid on both Python 2 and 3.
            sys.stderr.write(repr(source[pos:pos + 200]) + "\n")
            sys.stderr.write("\n")
            raise Exception("Didn't match token.")

        # The action turns the match into (style, text) pairs, which
        # are passed on unchanged.
        for style, text in action(m, state):
            yield style, text

        # Continue right after the text that was just consumed.
        pos = m.end()
230
231 ##########################################################################
232
def quote(s):
    """Return s with the characters that are special in HTML text escaped.

    "&", "<" and ">" are replaced by "&amp;", "&lt;" and "&gt;".
    Quote characters are left alone, so the result is only safe as
    element text, not inside an attribute value.
    """

    # "&" has to be handled first; otherwise the ampersands introduced
    # by the other replacements would be escaped a second time.
    s = s.replace("&", "&amp;")
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")

    return s
242
def format(source, out):
    """Write source to out as colorized HTML.

    source is the text to colorize and out is an object with a write()
    method.  Every output line starts with an <a name="N"> anchor for
    its 1-based line number N; the number itself is also shown when the
    module-level options.line_numbers is true.  Styled tokens are
    wrapped in <span> elements using the CSS from styles, and all token
    text is passed through quote().
    """

    def start_line(number):
        # Line preamble: an anchor for the line, which also displays
        # the number when line numbering was requested.
        if options.line_numbers:
            out.write('<a name="%d">% 5d</a> ' % (number, number))
        else:
            out.write('<a name="%d" /> ' % number)

    line = 1
    start_line(line)

    for style, text in tokenize(source):

        # A token may span several lines.  Split it at the newlines
        # (keeping them) so every line can get its own preamble.
        for t in re.split(r"(\n)", text):

            if t == '\n':
                # End the current line and open the next one.
                out.write(t)
                line += 1
                start_line(line)

            elif not t:
                # re.split produces '' next to each newline and for an
                # empty token; writing those would only add empty
                # <span> elements to the output.
                continue

            elif style:
                # Quoted and styled text.
                out.write('<span style="%s">%s</span>'
                          % (styles[style], quote(t)))

            else:
                # Quoted, but unstyled, text.
                out.write(quote(t))
287
288
def main():
    """Command-line entry point: colorize each file named in sys.argv.

    Parses the command line with optparse and stores the parsed options
    in the module-level name options, which format() reads.  For every
    file argument FILE, the file name is printed and FILE.html is
    written with the colorized source inside a minimal HTML page.
    """

    global options

    # Parse the options.

    usage = """%prog <files>

For each python source file listed in <files>, generates file.html
containing colorized html, anchors, and optional line numbers.
"""

    op = optparse.OptionParser(usage=usage, version="%prog 1")
    op.add_option("-n", "--line-numbers",
                  action="store_true", dest="line_numbers", default=False,
                  help="Prefix each line with its line number.")

    options, args = op.parse_args()

    if not args:
        op.error('You must specify at least one file to process.')

    # Universal newlines make sure format() only ever sees "\n" line
    # ends.  Python 2 needs the "U" flag for that; Python 3 text mode
    # does it by default and newer versions reject the flag.
    if sys.version_info[0] >= 3:
        read_mode = "r"
    else:
        read_mode = "rU"

    # Iterate over the files, converting each to html.

    for fn in args:
        print(fn)

        # Read the whole input, closing it even if the read fails.
        f = open(fn, read_mode)
        try:
            source = f.read()
        finally:
            f.close()

        # "U" is a read-only flag, so the output is opened with plain
        # "w".  try/finally closes it even if formatting raises.
        out = open(fn + ".html", "w")
        try:
            # Write out a quick html header.  The file name is text
            # content here, so it is escaped like the source is.
            out.write("<html><head><title>%s</title></head><body>\n"
                      % quote(fn))
            out.write("<pre>\n")

            # Format the python source.
            format(source, out)

            # Write out a quick html footer.
            out.write("</pre>\n")
            out.write("</body></html>\n")
        finally:
            out.close()


if __name__ == "__main__":
    main()
337