Here is the code to deal with one given file (the code to iterate over all the files is working, and I will glue the two together once this one does what I want):
It's a little long, but I wanted to put it all so you maybe I can get some tips to speed things up because it's pretty slow. import BeautifulSoup, tidy file = open("index.htm", "r") soup = BeautifulSoup.BeautifulSoup(file) file.close() #remove unnecessary things (scripts, styles, ...) for script in soup("script"): soup.script.extract() for style in soup("style"): soup.style.extract() #remove comments comments = soup.findAll(text=lambda text:isinstance(text, BeautifulSoup.Comment)) [comment.extract() for comment in comments] #the following removes things specific to the pages I'm working with, don't mind the langcanada things #I was just too lazy to change the name of this variable each time #I think this is an area that could be done differently to get more speed langcanada = soup.findAll("img", src="graphics/button_f.jpg") [img.parent.parent.extract() for img in langcanada] langcanada = soup.findAll("img", src="graphics/button_e.jpg") [img.parent.parent.extract() for img in langcanada] langcanada = soup.findAll("img", src="http://u1.extreme-dm.com/i.gif") [img.parent.parent.extract() for img in langcanada] langcanada = soup.findAll("a", href="research/disclaimer.htm") [img.parent.extract() for img in langcanada] comments = soup.findAll(text=" ") [comment.extract() for comment in comments] langcanada = soup.findAll("img", id="logo") [img.parent.parent.parent.extract() for img in langcanada] langcanada = soup.findAll("img", id="about") [img.parent.parent.parent.extract() for img in langcanada] langcanada = soup.findAll("img", src="images/navbgrbtm.jpg") [img.parent.parent.parent.parent.extract() for img in langcanada] langcanada = soup.findAll("img", src="images/navbgrtop.jpg") [img.parent.parent.parent.parent.extract() for img in langcanada] #delete class attributes for divs in range(len(soup.findAll("div"))): le_div = soup.findAll("div")[divs] del le_div["class"] for paras in range(len(soup.findAll("p"))): le_par = soup.findAll("p")[paras] del (le_par["class"]) 
for imgs in range(len(soup.findAll("img"))): le_img = soup.findAll("img")[imgs] del (le_img["hspace"]) del (le_img["vspace"]) del (le_img["border"]) # Add some class attributes for h1s in range(len(soup.findAll("h1"))): le_h1 = soup.findAll("h1")[h1s] le_h1["class"] = "heading1_main" for h2s in range(len(soup.findAll("h2"))): le_h2 = soup.findAll("h2")[h2s] le_h2["class"] = "heading2_main" for h3s in range(len(soup.findAll("h3"))): le_h3 = soup.findAll("h3")[h3s] le_h3["class"] = "heading3_main" for h4s in range(len(soup.findAll("h4"))): le_h4 = soup.findAll("h4")[h4s] le_h4["class"] = "heading4_main" for h5s in range(len(soup.findAll("h5"))): le_h5 = soup.findAll("h5")[h5s] le_h5["class"] = "heading5_main" # links, makes difference between internal and external ones for links in range(len(soup.findAll("a"))): le_link = soup.findAll("a")[links] le_href = le_link["href"] if le_href.startswith("""http://caslt.org""") or le_href.startswith("""http://www.caslt.org"""): le_link["class"] = "caslt_link" elif le_href.startswith("""http://"""): le_link["class"] = "external_link" else: le_link["class"] = "caslt_link" del (soup.body["onload"]) # This is what needs to be done: ###### change tables to divs ###### remove all td tags ###### remove all tr tags # Tidying soup = soup.prettify() erreurs = "" tidy_options = {"tidy-mark": 0, "wrap": 0, "wrap-attributes": 0, "indent": "auto", "output-xhtml": 1, "doctype": "loose", "input-encoding": "utf8", "output-encoding": "utf8", "break-before-br": 1, "clean": 1, "logical-emphasis": 1, "drop-font-tags": 1, "enclose-text": 1, "alt-text": " ", "write-back": 1, "error-file": erreurs, "show-warnings": 0, "quiet": 1, "drop-empty-paras": 1, "drop-proprietary-attributes": 1, "join-classes": 1, "join-styles": 1, "show-body-only": 1, "word-2000": 1, "force-output": 1} soup_tidy = tidy.parseString(soup, **tidy_options) outputfile = open("index2.htm", "w") outputfile.write(str(soup_tidy)) outputfile.close() Alan Gauld wrote: > "Sebastien Noel" 
<[EMAIL PROTECTED]> wrote > > >> My question, since I'm quite new to Python, is about what tool I >> should >> use to remove the table, tr and td tags, but not what's enclosed in >> them. >> I think BeautifulSoup isn't good for that because it removes what's >> enclosed as well. >> > > BS can do what you want; you must be missing something. One of the > most basic examples of using BS is to print an HTML file as plain text > - i.e. stripping just the tags. So it must be possible. > > Can you put together a short example of the code you are using? > > You can use lower-level parsers, but BS is generally easier; until > we know what you are doing it's hard to guess what might be wrong. > > Alan G. > > > _______________________________________________ > Tutor maillist - Tutor@python.org > http://mail.python.org/mailman/listinfo/tutor > > _______________________________________________ Tutor maillist - Tutor@python.org http://mail.python.org/mailman/listinfo/tutor