Great tool, indeed! But doc strings stay in the source text. If you need to remove doc strings as well, add the following to the __call__ method:
... # kill doc strings
... if not self.docstrings:
...     if toktype == tokenize.STRING and len(toktext) >= 6:
...         t = toktext.lstrip('rRuU')
...         if ((t.startswith("'''") and t.endswith("'''")) or
...             (t.startswith('"""') and t.endswith('"""'))):
...             return

as shown in the original post below.  Also, set self.docstrings in the
format method, similar to self.comments, as shown below in lines
starting with '...'.

/Jean Brouwers


M.E.Farmer wrote:
> qwweeeit wrote:
> > Thanks!  If you answer my posts one more time, I could consider
> > you my tutor...
> >
> > It was strange to have found a bug...!  In any case I will not go
> > deeper into the matter, because your explanation is enough for me.
> > I corrected the problem by hand, removing the tokens spanning
> > multiple lines (there were only 8 cases...).
> >
> > However, I haven't understood your hint about comments...
> > I succeeded in writing a Python script which removes comments.
> >
> > Here it is (in all its cumbersome and cryptic appearance!...):
> >
> > # removeCommentsTok.py
> > import tokenize
> > Input = "pippo1"
> > Output = "pippo2"
> > f = open(Input)
> > fOut = open(Output, "w")
> >
> > nLastLine = 0
> > for i in tokenize.generate_tokens(f.readline):
> > .   if i[0] == 52 and nLastLine != (i[2])[0]:
> > . .     fOut.write((i[4].replace(i[1], '')).rstrip() + '\n')
> > . .     nLastLine = (i[2])[0]
> > .   elif i[0] == 4 and nLastLine != (i[2])[0]:
> > . .     fOut.write(i[4])
> > . .     nLastLine = (i[2])[0]
> > f.close()
> > fOut.close()
> >
> > Some explanations for the guys like me...:
> > - 52 and 4 are the arbitrary codes for comments and NEWLINE
> >   respectively
> > - the comment removing is obtained by clearing the comment (i[1])
> >   in the input line (i[4])
> > - I also right-trimmed the line to get rid of the remaining blanks.
> The tokenizer sends multiline strings and comments as a single token.
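
As an aside, the magic numbers 52 and 4 need not be hard-coded: the
tokenize module exports them as the named constants tokenize.COMMENT
and tokenize.NEWLINE.  An untested sketch of the same script using
those names (the pippo1/pippo2 file names are kept from above; it
also shares the original's limitation with tokens spanning multiple
lines):

# removeCommentsTok2.py  (sketch, Python 2)
import tokenize

Input = "pippo1"
Output = "pippo2"
f = open(Input)
fOut = open(Output, "w")

nLastLine = 0
for toktype, toktext, (srow, scol), (erow, ecol), line in \
        tokenize.generate_tokens(f.readline):
    if toktype == tokenize.COMMENT and nLastLine != srow:
        # clear the comment text out of the source line
        fOut.write(line.replace(toktext, '').rstrip() + '\n')
        nLastLine = srow
    elif toktype == tokenize.NEWLINE and nLastLine != srow:
        fOut.write(line)
        nLastLine = srow
f.close()
fOut.close()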
>
> ######################################################################
> # python comment and whitespace stripper :)
> ######################################################################
>
> import keyword, os, sys, traceback
> import StringIO
> import token, tokenize
> __credits__ = 'just another tool that I needed'
> __version__ = '.7'
> __author__ = 'M.E.Farmer'
> __date__ = 'Jan 15 2005, Oct 24 2004'
>
> ######################################################################
>
> class Stripper:
>     """python comment and whitespace stripper :)
>     """
>     def __init__(self, raw):
>         self.raw = raw
>
...     def format(self, out=sys.stdout, comments=0, docstrings=0, spaces=1,
>                untabify=1, eol='unix'):
>         ''' strip comments, strip extra whitespace,
>             convert EOL's from Python code.
>         '''
>         # Store line offsets in self.lines
>         self.lines = [0, 0]
>         pos = 0
>         # Strips the first blank line if 1
>         self.lasttoken = 1
>         self.temp = StringIO.StringIO()
>         self.spaces = spaces
>         self.comments = comments
...         self.docstrings = docstrings
>
>         if untabify:
>             self.raw = self.raw.expandtabs()
>         self.raw = self.raw.rstrip() + ' '
>         self.out = out
>
>         self.raw = self.raw.replace('\r\n', '\n')
>         self.raw = self.raw.replace('\r', '\n')
>         self.lineend = '\n'
>
>         # Gather lines
>         while 1:
>             pos = self.raw.find(self.lineend, pos) + 1
>             if not pos: break
>             self.lines.append(pos)
>
>         self.lines.append(len(self.raw))
>         # Wrap text in a filelike object
>         self.pos = 0
>
>         text = StringIO.StringIO(self.raw)
>
>         # Parse the source.
>         ## Tokenize calls the __call__
>         ## function for each token till done.
>         try:
>             tokenize.tokenize(text.readline, self)
>         except tokenize.TokenError, ex:
>             traceback.print_exc()
>
>         # Ok, now we write it to a file,
>         # but we also need to clean the whitespace
>         # between the lines and at the ends.
>         self.temp.seek(0)
>
>         # Mac CR
>         if eol == 'mac':
>             self.lineend = '\r'
>         # Windows CR LF
>         elif eol == 'win':
>             self.lineend = '\r\n'
>         # Unix LF
>         else:
>             self.lineend = '\n'
>
>         for line in self.temp.readlines():
>             if spaces == -1:
>                 self.out.write(line.rstrip() + self.lineend)
>             else:
>                 if not line.isspace():
>                     self.lasttoken = 0
>                     self.out.write(line.rstrip() + self.lineend)
>                 else:
>                     self.lasttoken += 1
>                     if self.lasttoken <= self.spaces and self.spaces:
>                         self.out.write(self.lineend)
>
>     def __call__(self, toktype, toktext,
>                  (srow, scol), (erow, ecol), line):
>         ''' Token handler.
>         '''
>         # calculate new positions
>         oldpos = self.pos
>         newpos = self.lines[srow] + scol
>         self.pos = newpos + len(toktext)
>
>         # kill the comments
>         if not self.comments:
>             # Kill the comments ?
>             if toktype == tokenize.COMMENT:
>                 return
>
...         # kill doc strings
...         if not self.docstrings:
...             if toktype == tokenize.STRING and len(toktext) >= 6:
...                 t = toktext.lstrip('rRuU')
...                 if ((t.startswith("'''") and t.endswith("'''")) or
...                     (t.startswith('"""') and t.endswith('"""'))):
...                     return
>
>         # handle newlines
>         if toktype in [token.NEWLINE, tokenize.NL]:
>             self.temp.write(self.lineend)
>             return
>
>         # send the original whitespace, if needed
>         if newpos > oldpos:
>             self.temp.write(self.raw[oldpos:newpos])
>
>         # skip indenting tokens
>         if toktype in [token.INDENT, token.DEDENT]:
>             self.pos = newpos
>             return
>
>         # send text to the temp file
>         self.temp.write(toktext)
>         return
>
> ######################################################################
>
> def Main():
>     import sys
>     if sys.argv[1]:
>         filein = open(sys.argv[1]).read()
>         Stripper(filein).format(out=sys.stdout, comments=1,
>                                 untabify=1, eol='win')
>
> ######################################################################
>
> if __name__ == '__main__':
>     Main()
>
> M.E.Farmer
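
Coming back to the doc string test added at the top of this message:
pulled out as a stand-alone function, it is easier to poke at.  Note
that it is only a heuristic; it flags any triple-quoted string token,
not just strings that actually sit in doc string position.  The
function name below is made up for illustration:

# is_docstring -- the heuristic from the '...' patch, on its own.
def is_docstring(toktext):
    if len(toktext) < 6:           # shorter than two triple quotes
        return False
    t = toktext.lstrip('rRuU')     # skip raw/unicode string prefixes
    return ((t.startswith("'''") and t.endswith("'''")) or
            (t.startswith('"""') and t.endswith('"""')))

assert is_docstring('"""a doc string"""')
assert is_docstring("r'''a raw one'''")
assert not is_docstring("'a plain string'")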
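
For anyone wondering why Stripper defines __call__ with that odd
signature: in Python 2, tokenize.tokenize(readline, tokeneater) calls
its second argument once per token with five arguments, so passing the
instance itself makes it the token handler.  A tiny stand-alone
illustration:

# tokeneater demo: print each token's type name and text.
import StringIO, tokenize

def tokeneater(toktype, toktext, (srow, scol), (erow, ecol), line):
    # tok_name maps numeric token types back to their names;
    # tokenize extends it with COMMENT and NL.
    print tokenize.tok_name[toktype], repr(toktext)

tokenize.tokenize(StringIO.StringIO('x = 1 # a comment\n').readline,
                  tokeneater)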
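
Finally, note that as the code reads, the comments and docstrings
flags mean "keep": Main above passes comments=1, which preserves
comments, while the defaults of 0 strip them.  With the '...' changes
in place, stripping both comments and doc strings comes down to
leaving the flags at their defaults.  A possible driver (hypothetical;
it assumes the patched class was saved as stripper.py):

# strip_all.py -- hypothetical driver for the patched Stripper.
import sys
from stripper import Stripper

if __name__ == '__main__':
    source = open(sys.argv[1]).read()
    Stripper(source).format(out=sys.stdout, comments=0, docstrings=0,
                            spaces=1, untabify=1, eol='unix')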