But continuing... you might want to take a step back and think about this. Each of the self.hN and self.in_hN pairs has something in common, and they all have the same behavior. That sounds a lot like a job for "object oriented programming", no? We can create a class that looks and acts like a list (like hN), but is only active if we have set it (if in_hN is True).
Actually, because of the structure of the SGML code, "BAD CODE1" isn't quite the "bad code"; the "handle_data" code is actually worse. The reason "BAD CODE1" looks bad is not because of your code, but because SGMLParser forces you to create so many methods in the subclass. There are no "start_hN" and "end_hN" catch-all methods available. For this reason, I made only a minor change to the "start_hN" and "end_hN" methods, but changed the reset and handle_data methods quite a bit.
class HeaderCapture:
    """List-like collector that only records items while it is active.

    A new capture starts out inactive.  Call activate() to begin recording
    appended items and deactivate() to stop; items appended while inactive
    are silently dropped.  len() and indexing read the recorded items.
    """

    def __init__(self, contents=None):
        """Create an inactive capture, optionally seeded from *contents*.

        contents -- any iterable of initial items; None (the default) means
                    start empty.  The items are copied, so the caller's
                    object is never shared with or modified by the capture.
        """
        # A None sentinel replaces the original mutable ``[]`` default, and
        # list() (rather than a slice) guarantees an appendable list even
        # when the seed is a tuple or a generator.
        if contents is None:
            self.contents = []
        else:
            self.contents = list(contents)
        self.deactivate()

    def append(self, item):
        """Record *item* if the capture is active; otherwise do nothing."""
        # could raise an exception, but for now, ignore
        if self.active:
            self.contents.append(item)

    def __len__(self):
        """Return the number of recorded items."""
        return len(self.contents)

    def __getitem__(self, idx):
        """Return the recorded item (or slice of items) at *idx*."""
        return self.contents[idx]

    def activate(self):
        """Start recording appended items."""
        self.active = True

    def deactivate(self):
        """Stop recording; later append() calls are ignored."""
        self.active = False
...
class Lister(SGMLParser):
    """Parser that gathers the text found inside h1 .. h6 elements.

    ``self.headers`` maps each tag name ('h1' .. 'h6') to a HeaderCapture.
    A capture is switched on by its ``start_hN`` method and off by its
    ``end_hN`` method; ``handle_data`` offers every text chunk to all of
    them, and only the active ones keep it.
    """

    def reset(self):
        """Reset the base parser and install six fresh, inactive captures."""
        SGMLParser.reset(self)
        self.headers = {}
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            self.headers[tag] = HeaderCapture()

    def handle_data(self, text):
        """Offer *text* to every capture; inactive captures ignore it."""
        # only one would be active, but legally, two could
        for tag in self.headers:
            self.headers[tag].append(text)

    def _start_capture(self, tag):
        """Switch on the capture registered under *tag*."""
        self.headers[tag].activate()

    def _stop_capture(self, tag):
        """Switch off the capture registered under *tag*."""
        self.headers[tag].deactivate()

    # SGMLParser has no catch-all start_hN/end_hN hook, so each heading
    # level needs its own pair of handlers; they all delegate to the two
    # helpers above.

    def start_h1(self, attrs):
        self._start_capture('h1')

    def end_h1(self):
        self._stop_capture('h1')

    def start_h2(self, attrs):
        self._start_capture('h2')

    def end_h2(self):
        self._stop_capture('h2')

    def start_h3(self, attrs):
        self._start_capture('h3')

    def end_h3(self):
        self._stop_capture('h3')

    def start_h4(self, attrs):
        self._start_capture('h4')

    def end_h4(self):
        self._stop_capture('h4')

    def start_h5(self, attrs):
        self._start_capture('h5')

    def end_h5(self):
        self._stop_capture('h5')

    def start_h6(self, attrs):
        self._start_capture('h6')

    def end_h6(self):
        self._stop_capture('h6')
But again, like others have suggested, you should rethink your problem and your solution before starting down your path. What are you really capturing?
Rethink problem... I try to use sgmllib to get all the info tagged in "h1"... "h6". I've created the file lister.py:
"
from sgmllib import SGMLParser
class Lister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.h1 = []
        self.h2 = []
        self.h3 = []
        self.h4 = []
        self.h5 = []
        self.h6 = []
        self.in_h1 = False
        self.in_h2 = False
        self.in_h3 = False
        self.in_h4 = False
        self.in_h5 = False
        self.in_h6 = False
    def handle_data(self, text):
        if self.in_h1 == True:
            self.h1.append(text)
        elif self.in_h2 == True:
            self.h2.append(text)
        elif self.in_h3 == True:
            self.h3.append(text)
        elif self.in_h4 == True:
            self.h4.append(text)
        elif self.in_h5 == True:
            self.h5.append(text)
        elif self.in_h6 == True:
            self.h6.append(text)
    #AND NOW "BAD CODE1":
    def start_h1(self, attrs):
        self.in_h1 = True
    def end_h1(self):
        self.in_h1 = False
    def start_h2(self, attrs):
        self.in_h2 = True
    def end_h2(self):
        self.in_h2 = False
    def start_h3(self, attrs):
        self.in_h3 = True
    def end_h3(self):
        self.in_h3 = False
    def start_h4(self, attrs):
        self.in_h4 = True
    def end_h4(self):
        self.in_h4 = False
    def start_h5(self, attrs):
        self.in_h5 = True
    def end_h5(self):
        self.in_h5 = False
    def start_h6(self, attrs):
        self.in_h6 = True
    def end_h6(self):
        self.in_h6 = False
"
And now I want to print all the text in these tags.
file use_lister.py:
"
import urllib, lister
f = open('_1.html', 'r')
text = f.read()
f.close()
parser = urllister.Lister()
parser.feed(text)
parser.close()
#AND NOW "BAD CODE2":
Show_step('h1')
for i in parser.h1:
    print i
Show_step('h2')
for i in parser.h2:
    print i
Show_step('h3')
for i in parser.h3:
    print i
Show_step('h4')
for i in parser.h4:
    print i
Show_step('h5')
for i in parser.h5:
    print i
Show_step('h6')
for i in parser.h6:
    print i
"
And I don't like "BAD CODE1" and "BAD CODE2".
How can I rewrite this bad code?
_______________________________________________
Tutor maillist - Tutor@python.org
http://mail.python.org/mailman/listinfo/tutor
--
There's so many different worlds,
So many different suns.
And we have just one world,
But we live in different ones.
_______________________________________________
Tutor maillist - Tutor@python.org
http://mail.python.org/mailman/listinfo/tutor