I seem to have a thing for writing "simple" markup languages that get translated to HTML, because HTML is both too slow to type and too hard to read for my taste. I started doing this in 1994 with a bunch of m4 macros called "htm4". This hideous mess of kludges is the fourth one, and I like its input language best of any of them so far. Several items in the queue to be posted to kragen-tol are in the format this program accepts.
It uses a couple of modules from the MetaPy package, which is available from http://pobox.com/~kragen/sw/MetaPy-6.tar.gz and has been posted to this list. #!/usr/local/bin/python # parse this random text format I wrote some article on Arc in. # format includes: # blank lines separate paragraphs (ok) # - signs at the beginnings of lines of text indicate bullets (ok) # lines of - signs indicate that the previous line is a header (ok) # a line of = signs indicates the title of the document (ok) # \(foo bar baz) is a foo tag around 'bar baz', which must have # balanced parentheses within it.. Tags include ul, href, and i. # ul and i translate to the corresponding HTML tags; \(href url bar baz) # translates to <a href="url">bar baz</a>. # Indents of four or more spaces indicate blockquotes. (ok) import sys, string, MetaPy.variant, MetaPy.Iterate linetoken = MetaPy.variant.defvariant(line=["text"], blankline=[], dashes=[], bullet=[], indent=[], outdent=[], equalses=[], eof=[]) def htmlquote(s): return string.replace( string.replace( string.replace( string.replace(s, "&", "&"), '"', """), "<", "<"), ">", ">") class reallyparse(MetaPy.variant.Patmatch): """This state machine transforms a sequence of linetokens into XHTML.""" def __init__(self): self.para = None self.lastline = None self.indented = 0 self.in_li = 0 self.in_li_start = 0 self.parenstack = [] self.inpara = 0 # how to handle paragraphs? The problem is that what we're passed might # be one paragraph, none, or two, if it's declared to be a paragraph # at all. # <ul></ul> tags in the middle might break up text. # If there's no text other than those tags, we don't have a paragraph # at all, despite what the other function thought. # If there's text before the <ul> or after the </ul>, we have a single # paragraph of that stuff, but it won't include the <ul> or </ul>. # There can actually be text on both sides of the </ul>. # Also, list items get handled specially; the first paragraph in a # list item doesn't get <p> tags, but later ones do. def munge_text(self, text, para=0): """Here's how we handle tags. Just before we print text, we run it through this function. It keeps track of paren nesting levels more or less independent of everything else in the universe, replaces opening and closing tags and, where necessary, replaces entire tag contents. (This last clearly won't always work where the tag contents cross multiple text chunks, but that would produce invalid HTML anyway, because only the href tag does that.) """ rv = [] def rvadd(text, rv=rv): if not all(isspace, text): rv.append(htmlquote(text)) # find () quickly, that's all we care about lparens = [] rparens = [] ii = -1 while 1: ii = string.find(text, '(', ii+1) if ii == -1: break lparens.append(ii) ii = -1 while 1: ii = string.find(text, ')', ii+1) if ii == -1: break rparens.append(ii) parens = MetaPy.Iterate.itermerge(lparens, rparens) lasti = 0 for ii in parens: if text[ii] == '(': if ii > 0 and text[ii-1] == '\\': # it's a tag rvadd(text[lasti:ii-1]) space = findspace(text, ii) if space == -1: space = len(text) tagname = text[ii+1:space] ii = space # for 'lasti' below if tagname in ('i', 'ul', 'b'): rv.append("<%s>" % tagname) self.parenstack.append(tagname) elif tagname == 'href': space = findspace(text, ii+1) if space == -1: space = len(text) url = text[ii+1:space] ii = space rv.append('<a href="%s">' % htmlquote(url)) self.parenstack.append('a') else: raise "Unknown tag", tagname else: # it's a left paren rvadd(text[lasti:ii+1]) self.parenstack.append('(') elif text[ii] == ')': rvadd(text[lasti:ii]) if not self.parenstack: raise "Too many close parens" closing = self.parenstack.pop() if closing in ('i', 'b', 'a', 'ul'): rv.append("</%s>" % closing) elif closing == '(': rv.append(')') else: raise "Weird closing %s" % closing lasti = ii+1 rvadd(text[lasti:]) # now we run through and add paragraph markers if need be if para: if '<ul>' in rv: ulindex = rv.index('<ul>') if ulindex != 0: rv.insert(ulindex, '</p>') rv.insert(0, '<p>') elif '</ul>' in rv: ulindex = rv.index('</ul>') if ulindex != len(rv)-1: rv.insert(-1, "</p>") rv.insert(ulindex+1, "<p>") if self.in_li: rv.insert(ulindex, "</li>") if not self.in_li_start and ulindex != 0: rv.insert(ulindex, "</p>") rv.insert(0, "<p>") self.in_li = 0 self.in_li_start = 0 elif not self.in_li_start: rv.insert(0, "<p>") rv.append("</p>") else: self.in_li_start = 0 return string.join(rv, '') def when_line(self, (text,)): if not self.indented: self.put_last_line_in_para() self.lastline = text else: # self.indented is true print self.munge_text(text) def put_last_line_in_para(self): if self.lastline is not None: if self.para is None: self.para = [] self.para.append(self.lastline) self.lastline = None def flushpara(self): if self.para is not None: print self.munge_text(string.join(self.para, "\n"), para=1) self.para = None def when_blankline(self, _): self.put_last_line_in_para() self.flushpara() def specialline(self, message): self.flushpara() if self.lastline is None: raise message try: return self.lastline finally: self.lastline = None def when_equalses(self, _): print ('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\n<html><head><title>%s</title></head><body><h1>%s</h1>' % ((self.munge_text(self.specialline("You must have a line of text before a line of ===")),) * 2)) def when_dashes(self, _): print ("<h2>%s</h2>" % self.munge_text(self.specialline("You must have a line of text before a line of ---"))) def when_indent(self, _): self.put_last_line_in_para() self.flushpara() sys.stdout.write("<blockquote><pre>") # no trailing space or newline self.indented = 1 def when_outdent(self, _): print "</pre></blockquote>" self.indented = 0 def when_bullet(self, _): self.put_last_line_in_para() self.flushpara() if self.in_li: print "</li>" print "<li>", self.in_li = 1 self.in_li_start = 1 def when_eof(self, _): self.put_last_line_in_para() self.flushpara() print "</body></html>" realparser = reallyparse() def all(fun, seq): for x in seq: if not fun(x): return 0 return 1 def isspace(x): return x in string.whitespace def findspace(string, start): for ii in xrange(start, len(string)): if isspace(string[ii]): return ii return -1 def line_tokenize(text, fun): """Handle the per-line tokenization tasks. The input language contains some per-line stuff, like headers, and some finer-grained stuff, like \(href a b). This function tokenizes the file into lines and the other tokens that can be discerned on a per-line level. """ indented = 0 for line in text: while line and line[-1] in '\r\n': line = line[:-1] if 1: if all(isspace, line): fun(linetoken('blankline')) elif all(lambda x: x == '-', line) and len(line) > 3: fun(linetoken('dashes')) elif all(lambda x: x == '=', line): fun(linetoken('equalses')) else: if all(isspace, line[:4]): if not indented: indented = 1 fun(linetoken('indent')) else: if indented: indented = 0 fun(linetoken('outdent')) if line[:2] == '- ': fun(linetoken('bullet')) line = line[1:] while line[0] in string.whitespace: line = line[1:] fun(linetoken('line', line)) fun(linetoken('eof')) def main(): f = open(sys.argv[1], "r") line_tokenize(f.readlines(), realparser) f.close() if __name__ == "__main__": main() """ import parsearc; parsearc.line_tokenize(open("/home/kragen/notes/arc").readlines(), parsearc.realparser) Profile results: Fri Nov 30 00:12:21 2001 fooprof 8421 function calls (7894 primitive calls) in 0.840 CPU seconds Ordered by: cumulative time ncalls tottime percall cumtime percall filename:lineno(function) 1 0.000 0.000 0.840 0.840 <string>:1(?) 1 0.000 0.000 0.840 0.840 profile:0(import parsearc; parsearc.line_tokenize(open("/home/kragen/notes/arc").readlines(), parsearc.realparser)) 1 0.000 0.000 0.840 0.840 parsearc.py:185(line_tokenize) 333/1 0.120 0.000 0.840 0.840 variant.py:55(__call__) 226/31 0.070 0.000 0.770 0.025 parsearc.py:125(when_line) 80 0.090 0.001 0.400 0.005 parsearc.py:49(munge_text) 97 0.000 0.000 0.220 0.002 parsearc.py:137(flushpara) 66 0.010 0.000 0.170 0.003 parsearc.py:141(when_blankline) 452 0.040 0.000 0.170 0.000 defrecord.py:85(__getitem__) 244 0.080 0.000 0.120 0.000 Iterate.py:158(__getitem__) 80 0.050 0.001 0.120 0.002 Iterate.py:322(__init__) 997 0.060 0.000 0.090 0.000 parsearc.py:178(all) 333 0.060 0.000 0.080 0.000 variant.py:40(__init__) 245 0.020 0.000 0.080 0.000 parsearc.py:20(htmlquote) 980 0.060 0.000 0.060 0.000 string.py:361(replace) 12 0.000 0.000 0.050 0.004 parsearc.py:153(when_dashes) 324 0.020 0.000 0.040 0.000 Iterate.py:335(__getnext) 160 0.010 0.000 0.040 0.000 Iterate.py:220(pure_python_iter) 7 0.000 0.000 0.040 0.006 parsearc.py:163(when_bullet) 160 0.030 0.000 0.030 0.000 Iterate.py:186(__init__) 358 0.020 0.000 0.020 0.000 string.py:118(join) 324 0.020 0.000 0.020 0.000 Iterate.py:190(next) 10 0.000 0.000 0.020 0.002 parsearc.py:155(when_indent) 333 0.020 0.000 0.020 0.000 defrecord.py:64(__init__) 244 0.010 0.000 0.020 0.000 Iterate.py:343(next) 698 0.010 0.000 0.010 0.000 parsearc.py:183(isspace) 330 0.010 0.000 0.010 0.000 string.py:161(find) 271 0.010 0.000 0.010 0.000 parsearc.py:203(<lambda>) 1 0.000 0.000 0.010 0.010 parsearc.py:171(when_eof) 295 0.010 0.000 0.010 0.000 parsearc.py:131(put_last_line_in_para) 389 0.010 0.000 0.010 0.000 parsearc.py:201(<lambda>) 10 0.000 0.000 0.000 0.000 parsearc.py:160(when_outdent) 13 0.000 0.000 0.000 0.000 parsearc.py:144(specialline) 105 0.000 0.000 0.000 0.000 Iterate.py:327(<lambda>) 240 0.000 0.000 0.000 0.000 Iterate.py:156(__init__) 1 0.000 0.000 0.000 0.000 parsearc.py:150(when_equalses) 0 0.000 0.000 profile:0(profiler) """