On Dec 12, 3:56 pm, Ramdas <[EMAIL PROTECTED]> wrote: > I am doing some HTML scraping for a side project. > > I need a method using sgmllib or HTMLParser to parse an HTML file and > get line numbers of all the tags > Homework, perhaps? Well, I don't think your instructor will give many points for a pyparsing solution, but it was an interesting 10-minute exercise anyway. Once you use pyparsing's built-in expression for anyOpenTag, and scan through the input html, the rest is just bookkeeping in a defaultdict.
# -- Paul

# Tally, for every opening HTML tag on a page, the line numbers it occurs on.

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2, as in the original post
from collections import defaultdict

from pyparsing import anyOpenTag, lineno

# read in a random html page; try/finally makes sure the connection is
# closed even when read() raises
pg = urlopen("http://www.yahoo.com")
try:
    html = pg.read()
finally:
    pg.close()

# On Python 3 read() returns bytes, while pyparsing scans text.  On Python 2
# the result is already a str and is left untouched.
# NOTE(review): utf-8 is an assumption; the page's declared charset is not
# inspected here.
if not isinstance(html, str):
    html = html.decode("utf-8", "replace")

# print out what we got
print(html)
print("")

# create a defaultdict to tally up list of line numbers for each tag
tagLocs = defaultdict(list)


# use a parse action to update the tally whenever a tag is found
def tallyTagLineNumber(strg, locn, tagTokens):
    """Parse action: append the line number of the matched tag to tagLocs.

    strg      -- the complete string being scanned
    locn      -- offset into strg at which the match starts
    tagTokens -- tokens of the match; tagTokens[0] is used as the tally key

    Returns None, so pyparsing leaves the matched tokens unchanged.
    """
    line = lineno(locn, strg)
    tagLocs[tagTokens[0]].append(line)


# Attach the action to a copy: anyOpenTag is a module-level expression shared
# by everything that imports pyparsing, so setting a parse action on it
# directly would leak this tally into unrelated parsers in the same process.
openTag = anyOpenTag.copy()
openTag.setParseAction(tallyTagLineNumber)

# scan the input html, and add tag line numbers to the tally dict; the
# returned matches are not needed, the parse action does the bookkeeping
openTag.searchString(html)

# print out the results
tagnames = sorted(tagLocs)
for t in tagnames:
    print("%s %d" % (t, len(tagLocs[t])))
    print(tagLocs[t])
    print("")

# -------
# Prints:
# <... extracted HTML not shown...>
# a 46
# [54, 68, 96, 97, 98, 99, 110, 111, 112, 113, 114, 115, 116, 117, 120, 121, 122, 123, 124, 125, 126, 127, 130, 131, 132, 133, 134, 135, 136, 139, 140, 141, 142, 143, 150, 159, 160, 161, 162, 163, 164, 165, 166, 168, 169, 170]
#
# b 5
# [91, 109, 119, 129, 138]
#
# base 1
# [6]
#
# body 1
# [17]
#
# br 34
# [91, 93, 93, 94, 110, 111, 112, 113, 114, 115, 116, 117, 120, 121, 122, 123, 124, 125, 126, 127, 130, 131, 132, 133, 134, 135, 136, 139, 140, 141, 142, 143, 167, 167]
#
# center 1
# [18]
#
# font 15
# [30, 36, 54, 68, 90, 96, 97, 98, 99, 109, 119, 129, 138, 158, 168]
#
# form 1
# [31]
#
# head 1
# [2]
#
# html 1
# [1]
#
# img 2
# [26, 150]
#
# input 5
# [32, 33, 34, 36, 37]
#
# meta 2
# [4, 5]
#
# spacer 26
# [21, 24, 47, 50, 52, 53, 55, 56, 58, 61, 64, 66, 67, 69, 70, 72, 79, 82, 83, 89, 101, 104, 146, 148, 156, 157]
#
# span 1
# [39]
#
# style 2
# [7, 12]
#
# table 26
# [19, 21, 29, 30, 35, 46, 49, 54, 63, 68, 79, 82, 83, 84, 87, 88, 95, 106, 107, 108, 146, 148, 149, 156, 157, 158]
#
# td 58
# [20, 21, 24, 25, 28, 29, 30, 36, 37, 45, 47, 48, 50, 52, 53, 54, 54, 55, 56, 58, 61, 62, 64, 66, 67, 68, 68, 69, 70, 72, 78, 79, 82, 83, 86, 87, 89, 90, 96, 97, 98, 99, 101, 104, 105, 106, 107, 109, 119, 129, 138, 146, 148, 149, 156, 157, 157, 158]
#
# title 1
# [3]
#
# tr 37
# [20, 21, 23, 29, 30, 35, 44, 46, 49, 51, 54, 57, 63, 65, 68, 71, 78, 79, 82, 83, 85, 87, 88, 96, 97, 98, 99, 106, 107, 108, 146, 148, 149, 156, 157, 157, 158]
#
# -- http://mail.python.org/mailman/listinfo/python-list