idiosyncratic markup processor

Kragen Sitaker Thu, 17 Jan 2002 00:36:02 -0800

I seem to have a thing for writing "simple" markup languages that get
translated to HTML, because HTML is both too slow to type and too hard
to read for my taste.  I started doing this in 1994 with a bunch of m4
macros called "htm4".  This hideous mess of kludges is the fourth one,
and I like its input language best of any of them so far.  Several
items in the queue to be posted to kragen-tol are in the format this
program accepts.


It uses a couple of modules from the MetaPy package, which is
available from http://pobox.com/~kragen/sw/MetaPy-6.tar.gz and has
been posted to this list.

#!/usr/local/bin/python
# parse this random text format I wrote some article on Arc in.
# format includes:
# blank lines separate paragraphs (ok)
# - signs at the beginnings of lines of text indicate bullets (ok)
# lines of - signs indicate that the previous line is a header (ok)
# a line of = signs indicates the title of the document (ok)
# \(foo bar baz) is a foo tag around 'bar baz', which must have
# balanced parentheses within it.  Tags include ul, href, i, and b.
# ul, i, and b translate to the corresponding HTML tags; \(href url bar baz)
# translates to <a href="url">bar baz</a>.
# Indents of four or more spaces indicate blockquotes. (ok)

import sys, string, MetaPy.variant, MetaPy.Iterate

# Token type handed from line_tokenize() to the parser callback.  Each
# keyword names one kind of token and lists its fields: only `line`
# carries data (the line's text); every other kind is a bare marker.
# NOTE(review): the exact semantics of defvariant live in MetaPy.variant,
# which is not part of this file -- confirm there.
linetoken = MetaPy.variant.defvariant(line=["text"], blankline=[],
                                      dashes=[], bullet=[], indent=[],
                                      outdent=[], equalses=[], eof=[])

def htmlquote(s):
    """Return `s` with the HTML-special characters & " < > escaped.

    The ampersand is replaced first so that the entities produced by the
    later replacements are not escaped a second time.  The result is safe
    both as element text and inside a double-quoted attribute value.

    String methods are used instead of the `string` module functions; they
    give the same result and also exist on Python versions where
    `string.replace` has been removed.
    """
    s = s.replace("&", "&amp;")
    s = s.replace('"', "&quot;")
    s = s.replace("<", "&lt;")
    return s.replace(">", "&gt;")

class reallyparse(MetaPy.variant.Patmatch):
    r"""State machine that turns a sequence of linetokens into HTML on stdout.

    line_tokenize() calls the instance once per token.
    NOTE(review): the Patmatch base class is presumed to route each token
    to the `when_<kind>` method of the same name, passing the token's
    fields as the single argument -- confirm against MetaPy.variant.

    Instance state:
      para         -- list of text lines collected for the paragraph being
                      built, or None when no paragraph is pending
      lastline     -- most recent text line, held back one step because a
                      following line of --- or === turns it into a heading
      indented     -- true while inside a <blockquote><pre> block
      in_li        -- true while a <li> opened by a bullet is still open
      in_li_start  -- true until the first paragraph of the current <li>
                      has been emitted (that paragraph gets no <p> tags)
      parenstack   -- open parens / tags, kept across munge_text() calls
      inpara       -- initialised here; not read anywhere in this class

    All output is written with sys.stdout.write() (not the print
    statement) and errors are raised as ValueError (not string
    exceptions), so the class parses and runs on both old and new Python.
    """

    def __init__(self):
        self.para = None
        self.lastline = None
        self.indented = 0
        self.in_li = 0
        self.in_li_start = 0
        self.parenstack = []
        self.inpara = 0

    # how to handle paragraphs?  The problem is that what we're passed might
    # be one paragraph, none, or two, if it's declared to be a paragraph
    # at all.
    # <ul></ul> tags in the middle might break up text.
    # If there's no text other than those tags, we don't have a paragraph
    # at all, despite what the other function thought.
    # If there's text before the <ul> or after the </ul>, we have a single
    # paragraph of that stuff, but it won't include the <ul> or </ul>.
    # There can actually be text on both sides of the </ul>.
    # Also, list items get handled specially; the first paragraph in a
    # list item doesn't get <p> tags, but later ones do.
    def munge_text(self, text, para=0):
        r"""Translate one chunk of marked-up text to HTML and return it.

        Literal text is HTML-escaped with htmlquote().  A backslash
        immediately followed by '(' opens a tag whose name runs up to the
        next whitespace: i, ul and b become the HTML tags of the same
        name, and href takes one more whitespace-delimited word as the
        URL and becomes <a href="url">.  The matching ')' closes the tag.
        Ordinary parentheses are passed through but are tracked on the
        same stack, so they must balance.

        The stack (self.parenstack) persists between calls, so a tag may
        be opened in one chunk and closed in a later one.

        text -- the raw text chunk
        para -- when true, wrap the text in <p>...</p>, working around any
                <ul> / </ul> in the chunk and the no-<p> rule for the
                first paragraph of a list item

        Raises ValueError for an unknown tag name, for a ')' with nothing
        open, or for an unrecognised entry on the stack.

        NOTE: parentheses that occur inside a tag name or an href URL are
        still visited by the loop below, after `lasti` has already moved
        past them; such input is not handled specially.
        """
        rv = []

        # rv is bound as a default argument so the helper does not rely
        # on nested scopes.  Whitespace-only (and empty) chunks are dropped.
        def rvadd(chunk, rv=rv):
            if not all(isspace, chunk):
                rv.append(htmlquote(chunk))

        # find () quickly, that's all we care about
        lparens = []
        rparens = []
        ii = text.find('(')
        while ii != -1:
            lparens.append(ii)
            ii = text.find('(', ii + 1)
        ii = text.find(')')
        while ii != -1:
            rparens.append(ii)
            ii = text.find(')', ii + 1)
        # NOTE(review): itermerge is presumed to yield the two sorted index
        # lists merged into one ascending sequence -- confirm in MetaPy.
        parens = MetaPy.Iterate.itermerge(lparens, rparens)

        lasti = 0  # index just past the last character already consumed
        for ii in parens:
            if text[ii] == '(':
                if ii > 0 and text[ii-1] == '\\':
                    # it's a tag; emit the text before the backslash
                    rvadd(text[lasti:ii-1])
                    space = findspace(text, ii)
                    if space == -1:
                        space = len(text)
                    tagname = text[ii+1:space]
                    ii = space  # for 'lasti' below
                    if tagname in ('i', 'ul', 'b'):
                        rv.append("<%s>" % tagname)
                        self.parenstack.append(tagname)
                    elif tagname == 'href':
                        # the next whitespace-delimited word is the URL
                        space = findspace(text, ii+1)
                        if space == -1:
                            space = len(text)
                        url = text[ii+1:space]
                        ii = space
                        rv.append('<a href="%s">' % htmlquote(url))
                        self.parenstack.append('a')
                    else:
                        raise ValueError("Unknown tag: %s" % tagname)
                else:
                    # it's a plain left paren; keep it in the output
                    rvadd(text[lasti:ii+1])
                    self.parenstack.append('(')
            elif text[ii] == ')':
                rvadd(text[lasti:ii])
                if not self.parenstack:
                    raise ValueError("Too many close parens")
                closing = self.parenstack.pop()
                if closing in ('i', 'b', 'a', 'ul'):
                    rv.append("</%s>" % closing)
                elif closing == '(':
                    rv.append(')')
                else:
                    raise ValueError("Weird closing %s" % closing)
            lasti = ii+1
        rvadd(text[lasti:])

        # now we run through and add paragraph markers if need be
        if para:
            if '<ul>' in rv:
                # text before the <ul> forms a paragraph of its own
                ulindex = rv.index('<ul>')
                if ulindex != 0:
                    rv.insert(ulindex, '</p>')
                    rv.insert(0, '<p>')
            elif '</ul>' in rv:
                ulindex = rv.index('</ul>')
                if ulindex != len(rv)-1:
                    # Text after the </ul> is a paragraph: close it at the
                    # very end.  (insert(-1, ...) would have placed the
                    # </p> *before* the last chunk, giving "<p></p>text".)
                    rv.append("</p>")
                    rv.insert(ulindex+1, "<p>")
                if self.in_li:
                    # the list is ending, so close the open item first
                    rv.insert(ulindex, "</li>")
                    if not self.in_li_start and ulindex != 0:
                        rv.insert(ulindex, "</p>")
                        rv.insert(0, "<p>")
                    self.in_li = 0
                    self.in_li_start = 0
            elif not self.in_li_start:
                rv.insert(0, "<p>")
                rv.append("</p>")
            else:
                # first paragraph of a list item: no <p>, but later ones do
                self.in_li_start = 0
        return ''.join(rv)

    def when_line(self, fields):
        """Handle a text line; `fields` is the one-element (text,) record."""
        (text,) = fields
        if not self.indented:
            # Hold the line back: the next token may make it a heading.
            self.put_last_line_in_para()
            self.lastline = text
        else:
            # Inside <pre>, lines are emitted immediately, one per line.
            sys.stdout.write(self.munge_text(text) + "\n")

    def put_last_line_in_para(self):
        """Move the held-back line, if any, into the pending paragraph."""
        if self.lastline is not None:
            if self.para is None:
                self.para = []
            self.para.append(self.lastline)
        self.lastline = None

    def flushpara(self):
        """Emit the pending paragraph, if any, and clear it."""
        if self.para is not None:
            sys.stdout.write(
                self.munge_text("\n".join(self.para), para=1) + "\n")
            self.para = None

    def when_blankline(self, _):
        """A blank line ends the current paragraph."""
        self.put_last_line_in_para()
        self.flushpara()

    def specialline(self, message):
        """Return the held-back line for use as a heading and clear it.

        Any paragraph collected before that line is flushed first.
        Raises ValueError(message) when there is no held-back line.
        """
        self.flushpara()
        if self.lastline is None:
            raise ValueError(message)
        heading = self.lastline
        self.lastline = None
        return heading

    def when_equalses(self, _):
        """A line of = signs: the previous line is the document title."""
        title = self.munge_text(self.specialline(
            "You must have a line of text before a line of ==="))
        sys.stdout.write(
            '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\n'
            '<html><head><title>%s</title></head><body><h1>%s</h1>\n'
            % (title, title))

    def when_dashes(self, _):
        """A line of - signs: the previous line is a second-level heading."""
        heading = self.munge_text(self.specialline(
            "You must have a line of text before a line of ---"))
        sys.stdout.write("<h2>%s</h2>\n" % heading)

    def when_indent(self, _):
        """Start of an indented block: open <blockquote><pre>."""
        self.put_last_line_in_para()
        self.flushpara()
        sys.stdout.write("<blockquote><pre>")  # no trailing space or newline
        self.indented = 1

    def when_outdent(self, _):
        """End of an indented block: close <pre> and <blockquote>."""
        sys.stdout.write("</pre></blockquote>\n")
        self.indented = 0

    def when_bullet(self, _):
        """Start of a bullet: close any open list item and open a new one."""
        self.put_last_line_in_para()
        self.flushpara()
        if self.in_li:
            sys.stdout.write("</li>\n")
        # "<li>" plus a separating space, no newline, so the item's text
        # follows on the same output line.
        sys.stdout.write("<li> ")
        self.in_li = 1
        self.in_li_start = 1

    def when_eof(self, _):
        """End of input: flush what is pending and close the document."""
        self.put_last_line_in_para()
        self.flushpara()
        sys.stdout.write("</body></html>\n")

# The single module-level parser instance; main() passes it to
# line_tokenize() as the per-token callback.
realparser = reallyparse()
            
def all(fun, seq):
    """Return 1 if fun(item) is true for every item of seq, else 0.

    Stops at the first item for which fun() is false.  An empty seq
    yields 1.  Note the (fun, seq) argument order used by this file.
    """
    for item in seq:
        if fun(item):
            continue
        return 0
    return 1

def isspace(x):
    """Tell whether the character x is one of string.whitespace."""
    whitespace_chars = string.whitespace
    return x in whitespace_chars

def findspace(string, start):
    """Return the index of the first whitespace character at or after start.

    Returns -1 when string has no whitespace from start onward.  The
    parameter name hides the `string` module inside this function, which
    is why the whitespace test goes through isspace().
    """
    pos = start
    end = len(string)
    while pos < end:
        if isspace(string[pos]):
            return pos
        pos = pos + 1
    return -1

def line_tokenize(text, fun):
    r"""Handle the per-line tokenization tasks.

    The input language contains some per-line stuff, like headers, and
    some finer-grained stuff, like \(href a b).  This function
    tokenizes the file into lines and the other tokens that can be
    discerned on a per-line level.

    text -- iterable of input lines; trailing CR/LF characters are removed
    fun  -- callback invoked with one linetoken per event, in order

    Tokens passed to fun:
      blankline -- the line is empty or all whitespace
      dashes    -- the line is four or more '-' characters and nothing else
      equalses  -- the line is '=' characters and nothing else
      indent    -- sent before the first line of a run of lines whose
                   first four characters are all whitespace
      outdent   -- sent before the first non-indented text line after
                   such a run, or at end of input if the run is still open
      bullet    -- sent before a line that starts with "- "; the dash and
                   the whitespace after it are removed from the line
      line      -- every other line (and the remainder of a bullet line),
                   carrying the line's text
      eof       -- sent once, after the last line

    Blank, dashes and equalses lines do not change the indent state.
    """
    indented = 0
    for line in text:
        while line and line[-1] in '\r\n':
            line = line[:-1]
        if all(isspace, line):
            fun(linetoken('blankline'))
        elif len(line) > 3 and all(lambda x: x == '-', line):
            fun(linetoken('dashes'))
        elif all(lambda x: x == '=', line):
            fun(linetoken('equalses'))
        else:
            if all(isspace, line[:4]):
                if not indented:
                    indented = 1
                    fun(linetoken('indent'))
            else:
                if indented:
                    indented = 0
                    fun(linetoken('outdent'))
            if line[:2] == '- ':
                fun(linetoken('bullet'))
                line = line[1:]
                # Guard on `line` so a bullet with nothing after the dash
                # ("- ") yields an empty text line instead of IndexError.
                while line and isspace(line[0]):
                    line = line[1:]
            fun(linetoken('line', line))
    if indented:
        # Input ended inside an indented block: close it so the consumer
        # is not left with an open block at eof.
        fun(linetoken('outdent'))
    fun(linetoken('eof'))

def main():
    """Convert the file named by the first command-line argument.

    The HTML goes to standard output.  With no argument, a usage line is
    written to standard error and the process exits with status 2.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s inputfile\n" % sys.argv[0])
        sys.exit(2)
    f = open(sys.argv[1], "r")
    try:
        # Read everything up front so the file is closed even if the
        # tokenizer or parser raises part-way through.
        lines = f.readlines()
    finally:
        f.close()
    line_tokenize(lines, realparser)

# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

"""
import parsearc; parsearc.line_tokenize(open("/home/kragen/notes/arc").readlines(), 
parsearc.realparser)

Profile results:
Fri Nov 30 00:12:21 2001    fooprof

         8421 function calls (7894 primitive calls) in 0.840 CPU seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.840    0.840 <string>:1(?)
        1    0.000    0.000    0.840    0.840 profile:0(import parsearc; 
parsearc.line_tokenize(open("/home/kragen/notes/arc").readlines(), 
parsearc.realparser))
        1    0.000    0.000    0.840    0.840 parsearc.py:185(line_tokenize)
    333/1    0.120    0.000    0.840    0.840 variant.py:55(__call__)
   226/31    0.070    0.000    0.770    0.025 parsearc.py:125(when_line)
       80    0.090    0.001    0.400    0.005 parsearc.py:49(munge_text)
       97    0.000    0.000    0.220    0.002 parsearc.py:137(flushpara)
       66    0.010    0.000    0.170    0.003 parsearc.py:141(when_blankline)
      452    0.040    0.000    0.170    0.000 defrecord.py:85(__getitem__)
      244    0.080    0.000    0.120    0.000 Iterate.py:158(__getitem__)
       80    0.050    0.001    0.120    0.002 Iterate.py:322(__init__)
      997    0.060    0.000    0.090    0.000 parsearc.py:178(all)
      333    0.060    0.000    0.080    0.000 variant.py:40(__init__)
      245    0.020    0.000    0.080    0.000 parsearc.py:20(htmlquote)
      980    0.060    0.000    0.060    0.000 string.py:361(replace)
       12    0.000    0.000    0.050    0.004 parsearc.py:153(when_dashes)
      324    0.020    0.000    0.040    0.000 Iterate.py:335(__getnext)
      160    0.010    0.000    0.040    0.000 Iterate.py:220(pure_python_iter)
        7    0.000    0.000    0.040    0.006 parsearc.py:163(when_bullet)
      160    0.030    0.000    0.030    0.000 Iterate.py:186(__init__)
      358    0.020    0.000    0.020    0.000 string.py:118(join)
      324    0.020    0.000    0.020    0.000 Iterate.py:190(next)
       10    0.000    0.000    0.020    0.002 parsearc.py:155(when_indent)
      333    0.020    0.000    0.020    0.000 defrecord.py:64(__init__)
      244    0.010    0.000    0.020    0.000 Iterate.py:343(next)
      698    0.010    0.000    0.010    0.000 parsearc.py:183(isspace)
      330    0.010    0.000    0.010    0.000 string.py:161(find)
      271    0.010    0.000    0.010    0.000 parsearc.py:203(<lambda>)
        1    0.000    0.000    0.010    0.010 parsearc.py:171(when_eof)
      295    0.010    0.000    0.010    0.000 parsearc.py:131(put_last_line_in_para)
      389    0.010    0.000    0.010    0.000 parsearc.py:201(<lambda>)
       10    0.000    0.000    0.000    0.000 parsearc.py:160(when_outdent)
       13    0.000    0.000    0.000    0.000 parsearc.py:144(specialline)
      105    0.000    0.000    0.000    0.000 Iterate.py:327(<lambda>)
      240    0.000    0.000    0.000    0.000 Iterate.py:156(__init__)
        1    0.000    0.000    0.000    0.000 parsearc.py:150(when_equalses)
        0    0.000             0.000          profile:0(profiler)
"""

idiosyncratic markup processor

Reply via email to