I'm using Python2.5 to try and convert some text files into XML using xml.minidom. I'm currently doing some plays which have a structure like Scene 1 Act 1 blah blah Act2 blah blah Scene 2 Act 1 and so on. I'm trying to turn it into <div type="scene">1 <div type="act">1 <speech /> </div> <div type="act">2 <speech /> </div> </div> (or ideally <div type="scene" id="1"> bit I can always come back to this later) I've currently got: <div id="" type="scene"> <div id=" " type="act"> <speech> II </speech> </div> </div> <div id="" type="scene"> <div id=" " type="act"> <speech> II </speech> </div> </div> <div id="" type="scene"> <div id=" " type="act"> The code I'm currently working with is: from itertools import groupby from xml.dom.minidom import Document
import re text = open('\\texts\\midsummer_nights_dream_gut.txt').read() def paragraphs(lines, is_separator=str.isspace, joiner=''.join): for separator_group, lineiter in groupby(lines, key=is_separator): if not separator_group: yield joiner(lineiter) def scene_node(scene): global docText docText = doc.createElement("div") #need to set the type to book, verse, drama docText.setAttribute("type", "scene") #need set the id to what ever break name or id: i.e. chapter 1 or act 1 docText.setAttribute("id", '') tei.appendChild(docText) for acts in actTxt.split(scene): act_node(acts) def act_node(act): global actText actText = doc.createElement("div") #need to set the type to book, verse, drama actText.setAttribute("type", "act") #need set the id to what ever id: 1 or I actText.setAttribute("id", ' ') docText.appendChild(actText) for p in paragraphs(act.splitlines(True)): speech_node(p) def speech_node(speech): para = doc.createElement("speech") actText.appendChild(para) ptext = doc.createTextNode(speech) para.appendChild(ptext) doc = Document() tei = doc.createElement("body") doc.appendChild(tei) sideTxt = re.compile(r"Scene\s+([1-9])", re.I) actTxt = re.compile(r"Act\s+([1-9])", re.I) for textStr in sideTxt.split(text): scene_node(textStr) print doc.toprettyxml(indent = " ") I'd be grateful for some pointers about getting a cleaner output. Thanks, Iain -- http://mail.python.org/mailman/listinfo/python-list