Test code for the SAX content handler. The output byte array is correct, but the code still needs further testing and cleanup. Checkpoint commit.
Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/df6772cf Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/df6772cf Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/df6772cf Branch: refs/heads/prestonc/parser Commit: df6772cf02a41a7d87c012c6e37775f4dce6158b Parents: a76d647 Author: Preston Carman <[email protected]> Authored: Fri Feb 14 23:25:05 2014 -0800 Committer: Preston Carman <[email protected]> Committed: Thu Feb 27 14:22:24 2014 -0800 ---------------------------------------------------------------------- .../vxquery/xmlparser/SAXContentHandler.java | 270 +++++++++++++++++++ 1 file changed, 270 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/df6772cf/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java b/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java index a8ec0b9..0ef2991 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java +++ b/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java @@ -30,6 +30,7 @@ import org.apache.vxquery.datamodel.builders.nodes.PINodeBuilder; import org.apache.vxquery.datamodel.builders.nodes.TextNodeBuilder; import org.apache.vxquery.datamodel.values.ValueTag; import org.apache.vxquery.types.BuiltinTypeQNames; +import org.apache.vxquery.util.GrowableIntArray; import org.apache.vxquery.xmlquery.query.XQueryConstants; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; @@ -37,6 +38,8 @@ import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.ext.LexicalHandler; +import 
edu.uci.ics.hyracks.data.std.primitive.BytePointable; +import edu.uci.ics.hyracks.data.std.primitive.IntegerPointable; import edu.uci.ics.hyracks.data.std.util.ArrayBackedValueStorage; public class SAXContentHandler implements ContentHandler, LexicalHandler { @@ -72,6 +75,35 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { private boolean pendingText; + private final ArrayBackedValueStorage leavesABVS; + + // Structure and data. + private final GrowableIntArray leavesKind; + private final GrowableIntArray leavesStart; + private final GrowableIntArray leavesEnd; + private final GrowableIntArray leavesDepth; + private final GrowableIntArray leavesParent; + private final GrowableIntArray leavesAttributeCount; + private final GrowableIntArray leavesChildrenCount; + + // Data keys for current progress through XML document. + private int textCount = 0; + private int textCurrentDepth = 0; + private int childSlotCounter = 0; + private final GrowableIntArray previousLeaf; + private final GrowableIntArray childStartOffset; + private final GrowableIntArray childSlotOffset; + private final int SLOT_SIZE = 4; + + private final int LEAF_TEXT = 1; + private final int LEAF_PRE_NODE = 2; + private final int LEAF_COMMENT = 3; + private final int LEAF_ATTRIBUTE = 4; + private final int LEAF_PI = 5; + private final int LEAF_PRE_DOCUMENT = 5; + private final int LEAF_POST_DOCUMENT = 6; + private final int LEAF_POST_NODE = 7; + public SAXContentHandler(boolean attachTypes, ITreeNodeIdProvider nodeIdProvider) { docABVS = new ArrayBackedValueStorage(); this.createNodeIds = nodeIdProvider != null; @@ -88,6 +120,21 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { enbStack = new ArrayList<ElementNodeBuilder>(); freeENBList = new ArrayList<ElementNodeBuilder>(); pendingText = false; + + leavesKind = new GrowableIntArray(); + leavesStart = new GrowableIntArray(); + leavesABVS = new ArrayBackedValueStorage(); + leavesEnd = new 
GrowableIntArray(); + leavesDepth = new GrowableIntArray(); + leavesParent = new GrowableIntArray(); + leavesAttributeCount = new GrowableIntArray(); + leavesChildrenCount = new GrowableIntArray(); + previousLeaf = new GrowableIntArray(); + childStartOffset = new GrowableIntArray(); + childSlotOffset = new GrowableIntArray(); + textCount = 0; + textCurrentDepth = 0; + } @Override @@ -106,6 +153,23 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { e.printStackTrace(); throw new SAXException(e); } + + leafNodeStart(LEAF_POST_DOCUMENT); + leafNodeEnd(); + + textCurrentDepth--; + int[] k = leavesKind.getArray(); + int[] s = leavesStart.getArray(); + int[] e = leavesEnd.getArray(); + int[] d = leavesDepth.getArray(); + int[] p = leavesParent.getArray(); + int[] c = leavesChildrenCount.getArray(); + int[] a = leavesAttributeCount.getArray(); + for (int i = 0; i < s.length; ++i) { + System.err.println(i + " " + k[i] + " - " + d[i] + ":" + s[i] + ":" + e[i] + " p=" + p[i] + " a=" + a[i] + + " c=" + c[i]); + } + } @Override @@ -120,6 +184,12 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { e.printStackTrace(); throw new SAXException(e); } + + leafNodeStart(LEAF_POST_NODE); + leafNodeEnd(); + + textCurrentDepth--; + } @Override @@ -145,6 +215,17 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { tempABVS.getDataOutput().writeUTF(data); pinb.setContent(tempABVS); endChildInParent(pinb); + + // Save to leavesABVS + leafNodeStart(LEAF_PI); + pinb.reset(leavesABVS); + if (createNodeIds) { + pinb.setLocalNodeId(nodeIdCounter); + } + pinb.setTarget(target); + pinb.setContent(data); + leafNodeEnd(); + } catch (IOException e) { e.printStackTrace(); throw new SAXException(e); @@ -161,7 +242,9 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { @Override public void startDocument() throws SAXException { + textCurrentDepth++; try { + nodeIdCounter = 0; db.reset(); 
docABVS.reset(); docb.reset(docABVS); @@ -169,6 +252,15 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { docb.setLocalNodeId(nodeIdCounter++); } docb.startChildrenChunk(); + + leafNodeStart(LEAF_PRE_DOCUMENT); + DocumentNodeBuilder docb2 = new DocumentNodeBuilder(); + docb2.reset(leavesABVS); + if (createNodeIds) { + docb2.setLocalNodeId(nodeIdCounter); + } + leafNodeEnd(); + flushText(); } catch (IOException e) { e.printStackTrace(); @@ -178,6 +270,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { @Override public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException { + try { flushText(); int idx = name.indexOf(':'); @@ -197,6 +290,24 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { if (createNodeIds) { enb.setLocalNodeId(nodeIdCounter++); } + + // Save to leavesABVS + leafNodeStart(LEAF_PRE_NODE); + ElementNodeBuilder enb2 = createENB(); + enb2.setMvs(leavesABVS); + enb2.setName(uriCode, localNameCode, prefixCode); + if (attachTypes) { + int typeUriCode = db.lookup(XQueryConstants.XS_NSURI); + int typeLocalNameCode = db.lookup(BuiltinTypeQNames.UNTYPED_STR); + int typePrefixCode = db.lookup(XQueryConstants.XS_PREFIX); + enb2.setType(typeUriCode, typeLocalNameCode, typePrefixCode); + } + if (createNodeIds) { + enb2.setLocalNodeId(nodeIdCounter); + } + leafNodeEnd(); + textCurrentDepth++; + enb.startAttributeChunk(); final int nAttrs = atts.getLength(); for (int i = 0; i < nAttrs; ++i) { @@ -223,6 +334,23 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { } anb.setValue(tempABVS); enb.endAttribute(anb); + + // Save to leavesABVS + leafNodeStart(LEAF_ATTRIBUTE); + anb.reset(leavesABVS); + anb.setName(aUriCode, aLocalNameCode, aPrefixCode); + if (attachTypes) { + int typeUriCode = db.lookup(XQueryConstants.XS_NSURI); + int typeLocalNameCode = db.lookup(BuiltinTypeQNames.UNTYPED_ATOMIC_STR); + int 
typePrefixCode = db.lookup(XQueryConstants.XS_PREFIX); + anb.setType(typeUriCode, typeLocalNameCode, typePrefixCode); + } + if (createNodeIds) { + anb.setLocalNodeId(nodeIdCounter++); + } + anb.setValue(atts.getValue(i)); + leafNodeEnd(); + } enb.endAttributeChunk(); enb.startChildrenChunk(); @@ -250,6 +378,16 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { } cnb.setValue(tempABVS); endChildInParent(cnb); + + // Save to leavesABVS + leafNodeStart(LEAF_COMMENT); + cnb.reset(leavesABVS); + if (createNodeIds) { + cnb.setLocalNodeId(nodeIdCounter); + } + cnb.setValue(buffer.toString()); + leafNodeEnd(); + buffer.delete(0, buffer.length()); } catch (IOException e) { e.printStackTrace(); @@ -267,6 +405,16 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { } tnb.setValue(tempABVS); peekENBStackTop().endChild(tnb); + + // Save to leavesABVS + leafNodeStart(LEAF_TEXT); + tnb.reset(leavesABVS); + if (createNodeIds) { + tnb.setLocalNodeId(nodeIdCounter); + } + tnb.setValue(buffer.toString()); + leafNodeEnd(); + buffer.delete(0, buffer.length()); pendingText = false; } @@ -314,6 +462,100 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { out.write(docABVS.getByteArray(), docABVS.getStartOffset(), docABVS.getLength()); } + public void writeOnce(ArrayBackedValueStorage abvs) throws IOException { + DataOutput out = abvs.getDataOutput(); + out.write(ValueTag.NODE_TREE_TAG); + byte header = NodeTreePointable.HEADER_DICTIONARY_EXISTS_MASK; + if (attachTypes) { + header |= NodeTreePointable.HEADER_TYPE_EXISTS_MASK; + } + if (createNodeIds) { + header |= NodeTreePointable.HEADER_NODEID_EXISTS_MASK; + } + out.write(header); + if (createNodeIds) { + out.writeInt(nodeIdProvider.getId()); + } + db.write(abvs); + + for (int i = 0; i < leavesKind.getSize(); ++i) { + if (leavesKind.getArray()[i] == LEAF_PRE_DOCUMENT) { + out.write(leavesABVS.getByteArray(), leavesStart.getArray()[i], + leavesEnd.getArray()[i] 
- leavesStart.getArray()[i]); + + int children = leavesChildrenCount.getArray()[i]; + if (children > 0) { + out.writeInt(children); + int offset = abvs.getLength(); + for (int s = 0; s < children; ++s) { + out.writeInt(-1); + addChildSlot(offset, s, children); + } + } + // Continue with nodes. + + } else if (leavesKind.getArray()[i] == LEAF_PRE_NODE) { + int nsCount = 0; + int attrCount = leavesAttributeCount.getArray()[i]; + int childrenCount = leavesChildrenCount.getArray()[i]; + + ElementNodeBuilder enb2 = createENB(); + enb2.setMvs(abvs); + enb2.setTagHeader(nsCount, attrCount, childrenCount); + + out.write(leavesABVS.getByteArray(), leavesStart.getArray()[i], + leavesEnd.getArray()[i] - leavesStart.getArray()[i]); + + if (attrCount > 0) { + out.writeInt(attrCount); + int offset = abvs.getLength(); + for (int s = 0; s < attrCount; ++s) { + out.writeInt(-1); + addChildSlot(offset, s, attrCount); + } + for (int s = 0; s < attrCount; ++s) { + ++i; + out.write(leavesABVS.getByteArray(), leavesStart.getArray()[i], leavesEnd.getArray()[i] + - leavesStart.getArray()[i]); + } + } + + if (childrenCount > 0) { + out.writeInt(childrenCount); + int offset = abvs.getLength(); + for (int s = 0; s < childrenCount; ++s) { + out.writeInt(-1); + addChildSlot(offset, s, childrenCount); + } + } + // Continue with nodes. 
+ + } else { + if (leavesKind.getArray()[i] != LEAF_POST_DOCUMENT && leavesKind.getArray()[i] != LEAF_POST_NODE) + out.write(leavesABVS.getByteArray(), leavesStart.getArray()[i], + leavesEnd.getArray()[i] - leavesStart.getArray()[i]); + finishChildSlot(abvs); + } + } + } + + private void addChildSlot(int offset, int count, int total) { + childStartOffset.insert(childSlotCounter, offset + total * SLOT_SIZE); + childSlotOffset.insert(childSlotCounter, offset + (total - count - 1) * SLOT_SIZE); + childSlotCounter++; + + for (int i = 0; i < childSlotCounter; ++i) { + System.err.println("\t\t" + i + " " + childStartOffset.getArray()[i] + " - " + + childSlotOffset.getArray()[i]); + } + } + + private void finishChildSlot(ArrayBackedValueStorage abvs) { + childSlotCounter--; + int length = abvs.getLength() - childStartOffset.getArray()[childSlotCounter]; + IntegerPointable.setInteger(abvs.getByteArray(), childSlotOffset.getArray()[childSlotCounter], length); + } + private ElementNodeBuilder createENB() { if (freeENBList.isEmpty()) { return new ElementNodeBuilder(); @@ -344,4 +586,32 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { peekENBStackTop().endChild(anb); } } + + private void leafNodeStart(int kind) { + leavesKind.append(kind); + leavesStart.append(leavesABVS.getLength()); + leavesDepth.append(textCurrentDepth); + leavesAttributeCount.append(0); + leavesChildrenCount.append(0); + + int parent = previousLeaf.getArray()[textCurrentDepth - 1]; + leavesParent.append(parent); + if (kind == LEAF_POST_NODE || kind == LEAF_POST_DOCUMENT) { + // Skip Count + } else if (kind == LEAF_ATTRIBUTE) { + leavesAttributeCount.getArray()[parent] += 1; + } else if (textCount != parent) { + leavesChildrenCount.getArray()[parent] += 1; + } + // If it can have children. 
+ if (kind == LEAF_PRE_NODE || kind == LEAF_PRE_DOCUMENT) { + previousLeaf.getArray()[textCurrentDepth] = textCount; + } + } + + private void leafNodeEnd() { + leavesEnd.append(leavesABVS.getLength()); + textCount++; + } + } \ No newline at end of file
