#------------------------------------------------------------------------
import re
import sys


def q(c):
    """Return a regex matching a region delimited by the character c.

    Inside the region, c (or any other character, including a newline)
    may be escaped with a backslash.

    The "ordinary character" class excludes the backslash so that a
    backslash can only ever be consumed as the start of an escape pair.
    Otherwise backtracking could close the region at an escaped delimiter.
    """
    c = re.escape(c)  # keep delimiters such as ']' or '^' literal
    return r"%s(\\.|[^%s\\])*%s" % (c, c, c)


double_quoted_string = q('"')   # C string literals:      "..."
single_quoted_string = q("'")   # C character constants:  '...'
c_comment = r"/\*.*?\*/"        # non-greedy; DOTALL lets it span lines
# The terminating newline is deliberately NOT part of the match: it must
# survive so that the following line (e.g. a #define) is not spliced onto
# this one, and so that a comment on the last line without a trailing
# newline is still recognized.
# NOTE: a // comment continued onto the next line with a trailing
# backslash is not handled.
cxx_comment = r"//[^\n]*"

# The pattern acts as a tokenizer. Strings and character constants are
# matched only so that comment-like text inside them is skipped over.
rx = re.compile(
    "|".join([double_quoted_string, single_quoted_string,
              c_comment, cxx_comment]),
    re.DOTALL)


def replace(x):
    """Substitution callback: comments become ' ', other tokens are kept.

    x is a match object produced by rx. Of the four token kinds, only
    comments start with '/'.
    """
    token = x.group(0)
    if token.startswith("/"):
        return " "
    return token


def replace_keeping_lines(x):
    """Like replace(), but a comment containing newlines becomes exactly
    that many newlines, so line numbers in the output are unchanged."""
    token = x.group(0)
    if token.startswith("/"):
        return "\n" * token.count("\n") or " "
    return token


def strip_comments(text, keep_lines=False):
    """Return text with C and C++ comments removed.

    text       -- C/C++ source code as a string
    keep_lines -- if true, multi-line comments are replaced by their
                  newlines instead of a single space, preserving line
                  numbering

    Anything not recognized as a token by rx is left unchanged.
    """
    return rx.sub(replace_keeping_lines if keep_lines else replace, text)


def main():
    """Filter: read C/C++ source on stdin, write it to stdout without
    comments."""
    sys.stdout.write(strip_comments(sys.stdin.read()))


if __name__ == "__main__":
    main()
#------------------------------------------------------------------------
# The regular expression matches ""-strings, ''-character-constants,
# c-comments, and c++-comments.  The replace function returns ' ' (space)
# when the matched thing was a comment, or the original thing otherwise.
# Depending on your use for this code, replace() should return as many
# '\n's as are in the matched thing, or ' ' otherwise, so that line
# numbers remain unchanged (this is what replace_keeping_lines() does).
#
# Basically, the regular expression is a tokenizer, and replace() chooses
# what to do with each recognized token.  Things not recognized as tokens
# by the regular expression are left unchanged.
#
# Jeff
#
# PS this is the test file I used:
#
#     /* ... */ xyzzy;
#     456 // 123
#     const char *mystr = "This is /*trouble*/";
#     /* * */
#     /* /* */
#     // /* /* */
#     /* // /* */
#     /* * */
pgp0CcH5aHF1o.pgp
Description: PGP signature
-- http://mail.python.org/mailman/listinfo/python-list