MrJean1 wrote: > There is an issue with both my and your code: it only works if doc > strings are triple quoted and if there are no other triple quoted > strings in the Python code. I had not considered single quoted strings ;) > A triple quoted string used in an assignment will be removed, for > example this case > > s = '''this string should not be removed''' > > > It is still unclear how to distinguish doc strings from other strings. > Also, I have not checked the precise Python syntax, but doc strings do > not need to be enclosed by triple quotes. A single quote may be > allowed too. > > Maybe this rule will work: a doc string is any string preceded by a > COLON token followed by zero, one or more INDENT or NEWLINE tokens. > Untested! Not needed — if you reread my post, I explain that I had solved that issue. If you use the line argument that the tokenizer supplies, we can strip whitespace and 'rRuU' from the start of the line and look for a single quote or a double quote. I have tested it and it works. I reworked the 'pep8' thing and fixed the bug you mentioned; here are the changes.
> ###################################################################### > > # Python source stripper > > > ###################################################################### > > > > import os > > import sys > > import token > > import keyword > > import StringIO > > import tokenize > > import traceback > > __credits__ = ''' > > Jürgen Hermann > > M.E.Farmer > > Jean Brouwers > > ''' > > __version__ = '.8' > > __author__ = 'M.E.Farmer' > > __date__ = 'Apr 16, 2005,' \ > > 'Jan 15 2005,' \ > > 'Oct 24 2004' \ > > > > > > > ###################################################################### > > > > class Stripper: > > """Python source stripper > > """ > > def __init__(self, raw): > > self.raw = raw > > > > def format(self, out=sys.stdout, comments=0, docstrings=0, > > spaces=1, untabify=1, eol='unix'): > > """ strip comments, > > strip docstrings, > > strip extra whitespace and lines, > > convert tabs to spaces, > > convert EOL's in Python code. > > """ > > # Store line offsets in self.lines > > self.lines = [0, 0] > > pos = 0 > > # Strips the first blank line if 1 > > self.lasttoken = 1 > > self.temp = StringIO.StringIO() > > self.spaces = spaces > > self.comments = comments > > self.docstrings = docstrings > > > > if untabify: > > self.raw = self.raw.expandtabs() > > self.raw = self.raw.rstrip()+' ' > > self.out = out > > > > # Have you ever had a multiple line ending script? > > # They can be nasty so lets get them all the same. > > self.raw = self.raw.replace('\r\n', '\n') > > self.raw = self.raw.replace('\r', '\n') > > self.lineend = '\n' > > > > # Gather lines > > while 1: > > pos = self.raw.find(self.lineend, pos) + 1 > > if not pos: break > > self.lines.append(pos) > > > > self.lines.append(len(self.raw)) > > self.pos = 0 > > > > # Wrap text in a filelike object > > text = StringIO.StringIO(self.raw) > > > > # Parse the source. > > ## Tokenize calls the __call__ > > ## method for each token till done. 
> > try: > > tokenize.tokenize(text.readline, self) > > except tokenize.TokenError, ex: > > traceback.print_exc() > > > > # Ok now we write it to a file > > # but we also need to clean the whitespace > > # between the lines and at the ends. > > self.temp.seek(0) > > > > # All this should be written into the > > # __call__ method just haven't yet... > > > > # Mac CR > > if eol == 'mac': > > self.lineend = '\r' > > # Windows CR LF > > elif eol == 'win': > > self.lineend = '\r\n' > > # Unix LF > > else: > > self.lineend = '\n' > > > > for line in self.temp.readlines(): > > if spaces == -1: > > self.out.write(line.rstrip()+self.lineend) > > else: > > if not line.isspace(): > > self.lasttoken=0 > > self.out.write(line.rstrip()+self.lineend) > > else: > > self.lasttoken+=1 > > if self.lasttoken<=self.spaces and self.spaces: > > self.out.write(self.lineend) > > > > def __call__(self, toktype, toktext, > > (srow,scol), (erow,ecol), line): > > """ Token handler. > > """ > > # calculate new positions > > oldpos = self.pos > > newpos = self.lines[srow] + scol > > self.pos = newpos + len(toktext) > > > > # kill comments > > if self.comments: > > if toktype == tokenize.COMMENT: > > return > > # kill doc strings if self.docstrings: # Assume if there is nothing on the # left side it must be a docstring if toktype == tokenize.STRING and \ line.lstrip(' rRuU')[0] in ["'",'"']: t = toktext.lstrip('rRuU') # pep8 frowns on triple single quotes if ( self.docstrings == 'pep8' or self.docstrings == 8): # pep8 frowns on single triples if not t.startswith('"""'): return else: return # handle newlines if toktype in [token.NEWLINE, tokenize.NL]: self.temp.write(self.lineend) return # send the original whitespace if newpos > oldpos: self.temp.write(self.raw[oldpos:newpos]) # skip indenting tokens if toktype in [token.INDENT, token.DEDENT]: self.pos = newpos return # send text to the temp file self.temp.write(toktext) return 
###################################################################### def Main(): import sys if sys.argv[1]: filein = open(sys.argv[1]).read() Stripper(filein).format(out=sys.stdout, comments=0, docstrings='pep8', untabify=1, eol='win') ###################################################################### if __name__ == '__main__': Main() That should work like a charm for all types of docstrings without disturbing others strings. M.E.Farmer -- http://mail.python.org/mailman/listinfo/python-list