Attached is another version of the stripper.py file. It contains my change, which seems to handle docstrings correctly (at least when run on itself).
/Jean Brouwers <pre> ###################################################################### # Python source stripper / cleaner ;) ###################################################################### import os import sys import token import keyword import StringIO import tokenize import traceback __credits__ = \ ''' J¸rgen Hermann M.E.Farmer Jean Brouwers ''' __version__ = '.8' __author__ = 'M.E.Farmer' __date__ = 'Apr 16, 2005,' \ 'Jan 15 2005,' \ 'Oct 24 2004' \ '''this docstring should be removed ''' ###################################################################### class Stripper: """Python source stripper / cleaner """ def __init__(self, raw): self.raw = raw def format(self, out=sys.stdout, comments=0, docstrings=0, spaces=1, untabify=1, eol='unix'): """ strip comments, strip docstrings, strip extra whitespace and lines, convert tabs to spaces, convert EOL's in Python code. """ # Store line offsets in self.lines self.lines = [0, 0] pos = 0 self.temp = StringIO.StringIO() # Strips the first blank line if 1 self.lasttoken = 1 self.spaces = spaces # 0 = no change, 1 = strip 'em self.comments = comments # yep even these # 0 = no change, 1 = strip 'em, 8 or 'pep8'= strip all but """'s self.docstrings = docstrings if untabify: self.raw = self.raw.expandtabs() self.raw = self.raw.rstrip()+' ' self.out = out # Have you ever had a multiple line ending script? # They can be nasty so lets get them all the same. self.raw = self.raw.replace('\r\n', '\n') self.raw = self.raw.replace('\r', '\n') self.lineend = '\n' # Gather lines while 1: pos = self.raw.find(self.lineend, pos) + 1 if not pos: break self.lines.append(pos) self.lines.append(len(self.raw)) self.pos = 0 self.lastOP = '' # Wrap text in a filelike object text = StringIO.StringIO(self.raw) # Parse the source. ## Tokenize calls the __call__ ## method for each token till done. 
try: tokenize.tokenize(text.readline, self) except tokenize.TokenError, ex: traceback.print_exc() # Ok now we write it to a file # but we also need to clean the whitespace # between the lines and at the ends. self.temp.seek(0) # All this should be written into the # __call__ method just haven't yet... # Mac CR if eol == 'mac': self.lineend = '\r' # Windows CR LF elif eol == 'win': self.lineend = '\r\n' # Unix LF else: self.lineend = '\n' for line in self.temp.readlines(): if spaces == -1: self.out.write(line.rstrip()+self.lineend) else: if not line.isspace(): self.lasttoken=0 self.out.write(line.rstrip()+self.lineend) else: self.lasttoken+=1 if self.lasttoken<=self.spaces and self.spaces: self.out.write(self.lineend) def __call__(self, toktype, toktext, (srow,scol), (erow,ecol), line): """ Token handler. """ # calculate new positions oldpos = self.pos newpos = self.lines[srow] + scol self.pos = newpos + len(toktext) ##print "*token: %s text: %r line: %r" % \ (token.tok_name[toktype], toktext, line) # kill comments if self.comments: if toktype == tokenize.COMMENT: return # kill doc strings if self.docstrings: # a STRING must be a docstring # if the most recent OP was ':' if toktype == tokenize.STRING and self.lastOP == ':': # pep8 frowns on triple single quotes if (self.docstrings == 'pep8' or self.docstrings == 8): if not toktext.endswith('"""'): return else: return elif toktype == token.OP: # remember most recent OP self.lastOP = toktext elif self.lastOP == ':': # newline and indent are OK inside docstring if toktype not in [token.NEWLINE, token.INDENT]: # otherwise the docstring ends self.lastOP = '' elif toktype == token.NEWLINE: # consider any string starting # on a new line as a docstring self.lastOP = ':' # handle newlines if toktype in [token.NEWLINE, tokenize.NL]: self.temp.write(self.lineend) return # send the original whitespace if newpos > oldpos: self.temp.write(self.raw[oldpos:newpos]) # skip indenting tokens if toktype in [token.INDENT, token.DEDENT]: 
self.pos = newpos return # send text to the temp file self.temp.write(toktext) return ###################################################################### def Main(): import sys if sys.argv[1]: filein = open(sys.argv[1]).read() Stripper(filein).format(out=sys.stdout, comments=1, docstrings=1, untabify=1, eol='win') ###################################################################### if __name__ == '__main__': Main() </pre> M.E.Farmer wrote: > I found the bug and hope I have squashed it. > Single and qouble quoted strings that were assignments and spanned > multilines using \ , were chopped after the first line. > example: > __date__ = 'Apr 16, 2005,' \ > 'Jan 15 2005,' \ > 'Oct 24 2004' > became: > __date__ = 'Apr 16, 2005,' \ > > Not good :( > > tokenizer sends this as: > name > operator > string > string > string > newline > > I added test for string assignments that end in \. > A flag is set and then all strings till a newline are ignored. > Also rearranged the script a little. > Maybe that will do it ... > Updates available at > > The script is located at: > > http://bellsouthpwp.net/m/e/mefjr75/python/stripper.py > > > > M.E.Farmer -- http://mail.python.org/mailman/listinfo/python-list