##############################################################################
#
# reqallxml.awk - A simple awk XML parser to grab Reqall items to org-mode
#
# Author:	Brad Bozarth <prettygood@cs.stanford.edu>
#
# Synopsis:
#
#	awk -f reqallxml.awk [FILESPEC]...
#
# Description:
#
#	This script is a simple XML parser for (modern variants of) awk.
#	Items from a Reqall RSS feed are cached locally, and new items
#	are added to an org-mode file.  Many things are currently
#	hardcoded - modify to suit!
#
#	Created by adding bits to xmlparser.awk by Steve Coile.
###############################################################################

BEGIN {

# State used by the rules below: "lineno" is the current input line
# (counting starts at one) and "sptr" is the height of the open-tag
# stack (empty at start).

	sptr = 0;
	lineno = 1;

# This parser treats every ">" as the end of a tag: a ">" meant as
# data is expected to be written "&gt;" (and "<" as "&lt;").  Splitting
# the input into records at ">" therefore hands each rule at most one
# tag per record, together with any text that precedes it.  New-lines
# carry no meaning here; in XML they are just whitespace.

	RS = ">";

}

# Count input lines.

{
	# Work on a scratch copy so that $0 itself is left untouched.
	# gsub() returns the number of replacements it made, which is the
	# number of new-lines in this record; "&" puts each one back.
	rec = $0;
	lineno = lineno + gsub( /\n/, "&", rec );
	rec = "";
}

# Special modes of operation.  These handle special XML sections, such
# as literal character data containing XML meta-characters ("cdata"
# sections), comments, and processing instructions ("pi") for other
# document processors.

# "Cdata" sections are terminated by the sequence, "]]>".

mode == "cdata" {

	# A record ending in "]]" was cut by the ">" of the "]]>"
	# terminator: strip the marker and return to normal parsing.
	# sub() acts on $0 and reports whether it removed anything.
	if ( sub( /\]\]$/, "" ) )
		mode = "";

	# The previous record lost its ">" to the record separator, so
	# put it back in front of the continuation text.
	item[idx] = item[idx] RS $0;
	next;
}

# Comment sections are terminated by the sequence, "-->".

mode == "comment" {

	# A record ending in "--" was cut by the ">" of the "-->"
	# terminator: strip the marker and return to normal parsing.
	# sub() acts on $0 and reports whether it removed anything.
	if ( sub( /--$/, "" ) )
		mode = "";

	# The previous record lost its ">" to the record separator, so
	# put it back in front of the continuation text.
	item[idx] = item[idx] RS $0;
	next;
}

# Processing instruction sections are terminated by the sequence, "?>".

mode == "pi" {

	# A record ending in "?" was cut by the ">" of the "?>"
	# terminator: strip the marker and return to normal parsing.
	# sub() acts on $0 and reports whether it removed anything.
	if ( sub( /\?$/, "" ) )
		mode = "";

	# The previous record lost its ">" to the record separator, so
	# put it back in front of the continuation text.
	item[idx] = item[idx] RS $0;
	next;
}

# Main tokenizer rule.  It runs for every record that is not inside a
# multi-record section (cdata, comment or processing instruction) and
# appends what it finds to the parallel arrays type[] / item[]:
#
#	data	text preceding a tag
#	cdata, comment, decl, pi	special sections
#	begin, end	open and close tags (item is the upper-cased name)
#	attrib, value	one pair per tag attribute
#	error	a message; parsing stops with exit status 1

( !mode ) {

	mline = 0;

# Our record separator is the end-of-tag marker, ">".  If we've
# encountered an end-of-tag marker, we should have a beginning-of-tag
# marker ("<") somewhere in the input record.  If not, either there
# is a spurious end-of-tag marker, or the record was terminated by
# the end-of-file.

	p = index( $0, "<" );

# Any data preceding the beginning-of-tag marker is raw data.  If no
# beginning-of-tag marker is present, everything in the input is data.

	if ( !p || ( p > 1 )) {
		idx += 1;
		type[idx] = "data";
		item[idx] = ( p ? substr( $0, 1, ( p - 1 )) : $0 );
		if ( !p ) next;
		$0 = substr( $0, p );
	};

# Recognize special XML sections.  Sections are not processed as XML,
# but handled specially.  If the section ends with the current input
# record, we continue processing XML in the next record; otherwise,
# we enter a special mode and perform special processing.

# Character data ("cdata") sections contain literal character data
# containing XML meta-characters that should not be processed. Character
# data sections begin with the sequence, "<![CDATA[" and end with "]]>".
# This section may span input records.

	if ( $0 ~ /^<!\[[Cc][Dd][Aa][Tt][Aa]\[/ ) {
		idx += 1;
		type[idx] = "cdata";
		$0 = substr( $0, 10 );		# skip the 9-character "<![CDATA["
		if ( $0 ~ /\]\]$/ ) sub( /\]\]$/, "", $0 );
		else {
			mode = "cdata";
			mline = lineno;
		};
		item[idx] = $0;
		next;
	}

# Comments begin with the sequence, "<!--" and end with "-->".
# This section may span input records.

	else if ( $0 ~ /^<!--/ ) {
		idx += 1;
		type[idx] = "comment";
		$0 = substr( $0, 5 );
		if ( $0 ~ /--$/ ) sub( /--$/, "", $0 );
		else {
			mode = "comment";
			mline = lineno;
		};
		item[idx] = $0;
		next;
	}

# Declarations begin with the sequence, "<!" and end with ">".
# This section may *NOT* span input records.

	else if ( $0 ~ /^<!/ ) {
		idx += 1;
		type[idx] = "decl";
		$0 = substr( $0, 3 );
		item[idx] = $0;
		next;
	}

# Processing instructions ("pi") begin with the sequence, "<?" and end
# with "?>".  This section may span input records.

	else if ( $0 ~ /^<\?/ ) {
		idx += 1;
		type[idx] = "pi";
		$0 = substr( $0, 3 );
		if ( $0 ~ /\?$/ ) sub( /\?$/, "", $0 );
		else {
			mode = "pi";
			mline = lineno;
		};
		item[idx] = $0;
		next;
	};

# Beyond this point, we're dealing strictly with a tag.

	idx += 1;

# A tag that begins with "</" (e.g. as in "</p>") is a close tag:
# it closes a tag-enclosed block.

	if ( substr( $0, 1, 2 ) == "</" ) {
		type[idx] = "end";
		tag = $0 = substr( $0, 3 );
	}

# A tag that begins simply with "<" (e.g. as in "<p>") is an open
# tag: it starts a tag-enclosed block.  Note that a stand-alone tag
# (e.g. "<data/>") will be handled later, and will appear as an open
# tag and close tag, with no data between.

	else {
		type[idx] = "begin";
		tag = $0 = substr( $0, 2 );
	};

# The tag name is saved in "tag" so that we can retrieve it later should
# we find that the tag is stand-alone and need to save a close tag item.
# Tag names are stored upper-cased, so they compare case-insensitively.

	sub( /[ \n\t\/].*$/, "", tag );
	tag = toupper( tag );
	item[idx] = tag;

# Validate the tag name.  If invalid, indicate so and exit.

	if ( tag !~ /^[A-Za-z][-+_.:0-9A-Za-z]*$/ )
	{
		type[idx] = "error";
		item[idx] = "line " lineno ": " tag ": invalid tag name";
		exit( 1 );
	}

# If an open tag is encountered, its name is recorded on the stack.
# If a close tag is encountered, its name is compared against the name
# on the top of the stack.  If the names differ, an error is generated
# (XML does not allow overlapping tags).

	if ( type[idx] == "begin" ) {
		sptr += 1;
		lstack[sptr] = lineno;
		tstack[sptr] = tag;
	}
	else if ( type[idx] == "end" ) {
		if ( tag != tstack[sptr] ) {
			type[idx] = "error";
			item[idx] = "line " lineno ": " tag \
					": unexpected close tag, expecting " \
					tstack[sptr];
			exit( 1 );
		};
		# Pop both halves of the stack entry, as the stand-alone
		# tag case below does.
		delete lstack[sptr];
		delete tstack[sptr];
		sptr -= 1;
	};

# Drop the tag name and the whitespace after it from the record.

	sub( /[^ \n\t\/]*[ \n\t]*/, "", $0 );

# Beyond this point, we're dealing with the tag attributes, if any,
# and/or the stand-alone end-of-tag marker.

	while ( $0 ) {

# If $0 contains only a slash (/), then the tag we're processing is
# stand-alone (e.g. "<data/>"), so we generate a close tag, but no data
# between the open and close tags.

		if ( $0 == "/" )
		{
			idx += 1;
			type[idx] = "end";
			item[idx] = tag;
			delete lstack[sptr];
			delete tstack[sptr];
			sptr -= 1;
			break;
		};

# The attribute name is determined.  Note that the attribute name is also
# saved to "attrib" so that we can reference it should the attribute
# not include a value.  If the attribute does not include a value,
# its name is given as its value.  Attribute names are stored
# lower-cased.

		idx += 1;
		type[idx] = "attrib";
		attrib = $0;
		sub( /=.*$/, "", attrib );
		attrib = tolower( attrib );

		item[idx] = attrib;

# Validate the attribute name.  If invalid, indicate so and exit.

		if ( attrib !~ /^[A-Za-z][-+_0-9A-Za-z]*(:[A-Za-z][-+_0-9A-Za-z]*)?$/ )
		{
			type[idx] = "error";
			item[idx] = "line " lineno ": " attrib \
					" : invalid attribute name";
			exit( 1 );
		}

# Drop the attribute name; what remains starts with "=" if the
# attribute has an explicit value.

		sub( /^[^=]*/, "", $0 );

# Each attribute must have a value.  If one isn't explicit in the input,
# we assign it one equal to the name of the attribute itself.  Attribute
# values in the input may be in one of three forms: enclosed in double
# quotes ("), enclosed in single quotes/apostrophes ('), or a single
# word.

		idx += 1;
		type[idx] = "value";

		if ( substr( $0, 1, 1 ) == "=" ) {
			if ( substr( $0, 2, 1 ) == "\"" ) {
				item[idx] = substr( $0, 3 );
				sub( /".*$/, "", item[idx] );
				sub( /^="[^"]*"/, "", $0 );
			}
			else if ( substr( $0, 2, 1 ) == "'" ) {
				item[idx] = substr( $0, 3 );
				sub( /'.*$/, "", item[idx] );
				sub( /^='[^']*'/, "", $0 );
			}
			else {
				# Unquoted value: everything after the "="
				# up to the first whitespace or slash.  This
				# is the same span the sub() on $0 below
				# consumes.
				item[idx] = substr( $0, 2 );
				sub( /[ \n\t\/].*$/, "", item[idx] );
				sub( /^=[^ \n\t\/]*/, "", $0 );
			};
		}
		else item[idx] = attrib;

		sub( /^[ \n\t]*/, "", $0 );

	};

	attrib = "";
	tag = "";
	next;

}

END {

# A non-empty mode at end of input means a cdata, comment or processing
# instruction section was opened but never terminated, so the input is
# not valid XML.  "mline" holds the line on which that section began.

	if ( mode != "" ) {
		if ( mode == "pi" )
			mode = "processing instruction";
		else if ( mode == "cdata" )
			mode = "character data";
		type[++idx] = "error";
		item[idx] = "line " mline ": unterminated " mode;
	}

# Whatever is still on the tag stack was opened and never closed, which
# is also invalid XML.  Report those tags from the innermost outwards.

	n = sptr;
	while ( n ) {
		idx++;
		type[idx] = "error";
		item[idx] = sprintf( "line %s: %s: unclosed tag", \
				lstack[n], tstack[n] );
		n--;
	}

}

# The following END rule consumes the data accumulated from the XML
# input stream: it extracts the Reqall items and files the new ones.

# Turn the parsed feed into org-mode entries.
#
# Walks type[] / item[] and collects, for every <item> element that has
# a <guid>, the text of its TITLE, GUID, CATEGORY, DESCRIPTION and
# PUBDATE children.  Each GUID not yet listed in the local database
# file is handed to add_reqall.awk, which writes an updated copy of
# gtd.org.  Only when that succeeds is gtd.org replaced and the item
# recorded in the database.  Finishes with a short summary and the
# current date.  All paths are hardcoded - modify to suit!

END {

# If errors occurred, generate appropriate messages and exit without
# further processing.  Errors are appended last, so they form one
# contiguous run at the end of the item list.

	if ( type[idx] == "error" ) {
		for ( n = idx; n && ( type[n] == "error" ); n -= 1 );
		for ( n += 1; n <= idx; n += 1 ) print "ERROR:", item[n];
		exit 1;
	};

# NOTE(review): the lookup path relies on the shell expanding "~",
# while writes go to /Users/Brad; they name the same file only when
# HOME is /Users/Brad.

	dbLookup = "~/repo/bin/reqalldb";
	dbFile = "/Users/Brad/repo/bin/reqalldb";
	orgFile = "/Users/Brad/Dev/reqall/Brad/repo/org/gtd.org";

	itemNum = 0;
	attrName = "";
	validAttr = 0;
	inItem = 0;
	hasGuid = 0;
	guid = "";

# Child elements of <item> whose text is kept.  A counted loop is used
# on this array so that fields are written in this order every time.

	nAttrs = 5;
	attrs[0] = "TITLE";
	attrs[1] = "GUID";
	attrs[2] = "CATEGORY";
	attrs[3] = "DESCRIPTION";
	attrs[4] = "PUBDATE";

# Pass 1: gather the wanted fields of every item.  Only "data" items
# are used; attributes, their values, comments and the like are
# skipped.
# NOTE(review): "cdata" items are skipped too - confirm the feed never
# wraps its text in CDATA sections.

	for ( n = 1; n <= idx; n += 1 ) {
		if ( type[n] == "begin" ) {
			if ( item[n] == "ITEM" ) {
				inItem = 1;
				# Start each item clean so that a missing
				# field or an empty <guid> cannot pick up
				# values left over from the previous item.
				guid = "";
				for ( a = 0; a < nAttrs; a += 1 )
					delete tmpBlob[attrs[a]];
			}
			if ( item[n] == "GUID" && inItem == 1 ) {
				hasGuid = 1;
			}
			validAttr = 0;
			attrName = item[n];
			if ( inItem == 1 )
				for ( a = 0; a < nAttrs; a += 1 )
					if ( item[n] == attrs[a] )
						validAttr = 1;
		}
		else if ( type[n] == "end" ) {
			if ( item[n] == "ITEM" ) {
				if ( hasGuid == 1 ) {
					itemNum++;
					if ( guid != "" )
						for ( a = 0; a < nAttrs; a += 1 )
							blobs[guid, attrs[a]] = tmpBlob[attrs[a]];
				}
				inItem = 0;
				hasGuid = 0;
				validAttr = 0;
			}
			for ( a = 0; a < nAttrs; a += 1 )
				if ( item[n] == attrs[a] )
					validAttr = 0;
		}
		else if ( type[n] == "error" ) print "ERROR:", item[n];
		else if ( type[n] == "data" && validAttr == 1 ) {
			tmpBlob[attrName] = item[n];
			if ( attrName == "GUID" && item[n] != "" ) {
				guids[item[n]] = item[n];
				guid = item[n];
			}
		}
	};

# Pass 2: file every GUID that is not in the database yet.

	newItems = 0;
	for ( guid in guids ) {

		# Look for the GUID as a whole line and as a fixed string:
		# GUIDs are typically URLs, whose "?" and "." would change
		# meaning if treated as a regular expression.
		ret = system( "grep -F -x -e " shquote( guid ) " " \
				dbLookup " > /dev/null" );
		if ( ret == 0 ) continue;

		# The "\n" sequences are passed through literally; awk
		# expands them when it processes the captured=... command
		# line assignment.
		captured = "captured=" blobs[guid, "DESCRIPTION"] \
				"\\n   :LOGBOOK:\\n   - Recorded: [" \
				blobs[guid, "PUBDATE"] \
				"]\\n   :END:";

		# The feed text is single-quoted so that the shell cannot
		# interpret anything inside it.
		addCmd = "/usr/bin/awk -f /Users/Brad/repo/bin/add_reqall.awk" \
				" inbox=Reqall " shquote( captured ) \
				" " orgFile " > /tmp/gtd";
		mvCmd = "mv /tmp/gtd " orgFile;

		# Replace gtd.org only if the new copy was produced, and
		# record the item only once it is really in gtd.org, so
		# that a failed run is retried next time.
		if ( system( addCmd ) == 0 && system( mvCmd ) == 0 ) {
			newItems++;
			print guid >> dbFile;
			for ( a = 0; a < nAttrs; a += 1 )
				print "   " blobs[guid, attrs[a]] >> dbFile;
		}
		else {
			print "ERROR:", "could not add " guid " to " orgFile;
		}
	}

	print "\nProcessed " itemNum " reqall items from feed."
	print "Added " newItems " new items to local DB."
	"date" | getline dateVal;
	print dateVal;

}

# Return s quoted as a single word for a /bin/sh command line: wrapped
# in single quotes, with each embedded single quote written as '\''.

function shquote( s ) {
	gsub( /'/, "'\\''", s );
	return "'" s "'";
}
