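Tracking both old and new parsers: SAXContentHandler now builds each document through both the old node-builder path (counted by copyOldCounter, emitted by write) and the new leaves-based path (counted by copyNewCounter, emitted by writeOnce) so their copy counts can be compared.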
Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/278c0db4 Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/278c0db4 Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/278c0db4 Branch: refs/heads/prestonc/parser Commit: 278c0db437a41b2feb53fc133969457ee4dd0e17 Parents: b3aee30 Author: Preston Carman <[email protected]> Authored: Wed Feb 19 15:28:23 2014 -0800 Committer: Preston Carman <[email protected]> Committed: Thu Feb 27 14:24:55 2014 -0800 ---------------------------------------------------------------------- .../vxquery/xmlparser/SAXContentHandler.java | 232 +++++++++++++++++-- 1 file changed, 214 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/278c0db4/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java b/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java index 40a35b0..2b3d613 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java +++ b/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java @@ -16,8 +16,11 @@ package org.apache.vxquery.xmlparser; import java.io.DataOutput; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import org.apache.vxquery.datamodel.accessors.nodes.NodeTreePointable; +import org.apache.vxquery.datamodel.builders.nodes.AbstractNodeBuilder; import org.apache.vxquery.datamodel.builders.nodes.AttributeNodeBuilder; import org.apache.vxquery.datamodel.builders.nodes.CommentNodeBuilder; import org.apache.vxquery.datamodel.builders.nodes.DictionaryBuilder; @@ -39,6 +42,7 @@ import 
edu.uci.ics.hyracks.data.std.primitive.IntegerPointable; import edu.uci.ics.hyracks.data.std.util.ArrayBackedValueStorage; public class SAXContentHandler implements ContentHandler, LexicalHandler { + private final ArrayBackedValueStorage docABVS; private final boolean createNodeIds; @@ -46,6 +50,8 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { private final ITreeNodeIdProvider nodeIdProvider; + private final ArrayBackedValueStorage tempABVS; + private final DocumentNodeBuilder docb; private final TextNodeBuilder tnb; @@ -60,7 +66,13 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { private final StringBuilder buffer; + private final List<ElementNodeBuilder> enbStack; + + private final List<ElementNodeBuilder> freeENBList; + private int nodeIdCounter; + private int copyOldCounter = 0; + private int copyNewCounter = 0; private boolean pendingText; @@ -70,7 +82,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { // Structure and data. 
private final GrowableIntArray leavesKind; -// private final GrowableIntArray leavesStart; + // private final GrowableIntArray leavesStart; private final GrowableIntArray leavesEnd; // private final GrowableIntArray leavesDepth; // private final GrowableIntArray leavesParent; @@ -96,17 +108,21 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { private final int LEAF_POST_NODE = 7; public SAXContentHandler(boolean attachTypes, ITreeNodeIdProvider nodeIdProvider) { + docABVS = new ArrayBackedValueStorage(); this.createNodeIds = nodeIdProvider != null; this.attachTypes = attachTypes; this.nodeIdProvider = nodeIdProvider; - enb = new ElementNodeBuilder(); + this.tempABVS = new ArrayBackedValueStorage(); docb = new DocumentNodeBuilder(); tnb = new TextNodeBuilder(); cnb = new CommentNodeBuilder(); pinb = new PINodeBuilder(); + enb = new ElementNodeBuilder(); anb = new AttributeNodeBuilder(); db = new DictionaryBuilder(); buffer = new StringBuilder(); + enbStack = new ArrayList<ElementNodeBuilder>(); + freeENBList = new ArrayList<ElementNodeBuilder>(); pendingText = false; leavesKind = new GrowableIntArray(600); @@ -136,6 +152,10 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { public void endDocument() throws SAXException { try { flushText(); + docb.endChildrenChunk(); + copyOldCounter++; + docb.finish(); + leafNodeStart(LEAF_POST_DOCUMENT); leafNodeEnd(); textCurrentDepth--; @@ -155,13 +175,18 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { // System.err.println(i + " " + k[i] + " - " + d[i] + ":" + s[i] + ":" + e[i] + " p=" + p[i] + " a=" + a[i] // + " c=" + c[i]); // } - } @Override public void endElement(String uri, String localName, String name) throws SAXException { try { flushText(); + ElementNodeBuilder enb = enbStack.remove(enbStack.size() - 1); + enb.endChildrenChunk(); + copyOldCounter++; + endChildInParent(enb); + freeENB(enb); + leafNodeStart(LEAF_POST_NODE); 
leafNodeEnd(); textCurrentDepth--; @@ -183,6 +208,20 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { public void processingInstruction(String target, String data) throws SAXException { try { flushText(); + startChildInParent(pinb); + tempABVS.reset(); + tempABVS.getDataOutput().writeUTF(target); + if (createNodeIds) { + pinb.setLocalNodeId(nodeIdCounter); + } + pinb.setTarget(tempABVS); + copyOldCounter++; + tempABVS.reset(); + tempABVS.getDataOutput().writeUTF(data); + pinb.setContent(tempABVS); + copyOldCounter++; + endChildInParent(pinb); + // Save to leavesABVS leafNodeStart(LEAF_PI); pinb.reset(leavesABVS); @@ -208,6 +247,8 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { @Override public void startDocument() throws SAXException { + copyOldCounter = 0; + copyNewCounter = 0; leavesKind.clear(); //leavesStart.clear(); leavesABVS.reset(); @@ -227,11 +268,18 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { try { nodeIdCounter = 0; db.reset(); + docABVS.reset(); + docb.reset(docABVS); + if (createNodeIds) { + docb.setLocalNodeId(nodeIdCounter++); + } + docb.startChildrenChunk(); leafNodeStart(LEAF_PRE_DOCUMENT); - docb.reset(leavesABVS); + DocumentNodeBuilder docb2 = new DocumentNodeBuilder(); + docb2.reset(leavesABVS); if (createNodeIds) { - docb.setLocalNodeId(nodeIdCounter++); + docb2.setLocalNodeId(nodeIdCounter); } leafNodeEnd(); @@ -249,13 +297,11 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { flushText(); int idx = name.indexOf(':'); String prefix = idx < 0 ? 
"" : name.substring(0, idx); + ElementNodeBuilder enb = createENB(); + startChildInParent(enb); int uriCode = db.lookup(uri); int localNameCode = db.lookup(localName); int prefixCode = db.lookup(prefix); - - // Save to leavesABVS - leafNodeStart(LEAF_PRE_NODE); - enb.setMvs(leavesABVS); enb.setName(uriCode, localNameCode, prefixCode); if (attachTypes) { int typeUriCode = db.lookup(XQueryConstants.XS_NSURI); @@ -266,9 +312,25 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { if (createNodeIds) { enb.setLocalNodeId(nodeIdCounter++); } + + // Save to leavesABVS + leafNodeStart(LEAF_PRE_NODE); + ElementNodeBuilder enb2 = createENB(); + enb2.setMvs(leavesABVS); + enb2.setName(uriCode, localNameCode, prefixCode); + if (attachTypes) { + int typeUriCode = db.lookup(XQueryConstants.XS_NSURI); + int typeLocalNameCode = db.lookup(BuiltinTypeQNames.UNTYPED_STR); + int typePrefixCode = db.lookup(XQueryConstants.XS_PREFIX); + enb2.setType(typeUriCode, typeLocalNameCode, typePrefixCode); + } + if (createNodeIds) { + enb2.setLocalNodeId(nodeIdCounter); + } leafNodeEnd(); textCurrentDepth++; + enb.startAttributeChunk(); final int nAttrs = atts.getLength(); for (int i = 0; i < nAttrs; ++i) { String aName = atts.getQName(i); @@ -276,6 +338,25 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { int aPrefixCode = db.lookup(aIdx < 0 ? 
"" : aName.substring(0, aIdx)); int aLocalNameCode = db.lookup(atts.getLocalName(i)); int aUriCode = db.lookup(atts.getURI(i)); + String aValue = atts.getValue(i); + tempABVS.reset(); + DataOutput tempOut = tempABVS.getDataOutput(); + tempOut.write(ValueTag.XS_UNTYPED_ATOMIC_TAG); + tempOut.writeUTF(aValue); + enb.startAttribute(anb); + anb.setName(aUriCode, aLocalNameCode, aPrefixCode); + if (attachTypes) { + int typeUriCode = db.lookup(XQueryConstants.XS_NSURI); + int typeLocalNameCode = db.lookup(BuiltinTypeQNames.UNTYPED_ATOMIC_STR); + int typePrefixCode = db.lookup(XQueryConstants.XS_PREFIX); + anb.setType(typeUriCode, typeLocalNameCode, typePrefixCode); + } + if (createNodeIds) { + anb.setLocalNodeId(nodeIdCounter++); + } + anb.setValue(tempABVS); + copyOldCounter++; + enb.endAttribute(anb); // Save to leavesABVS leafNodeStart(LEAF_ATTRIBUTE); @@ -292,7 +373,12 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { } anb.setValue(atts.getValue(i)); leafNodeEnd(); + } + enb.endAttributeChunk(); + copyOldCounter++; + enb.startChildrenChunk(); + enbStack.add(enb); } catch (IOException e) { e.printStackTrace(); throw new SAXException(e); @@ -307,7 +393,16 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { public void comment(char[] ch, int start, int length) throws SAXException { try { flushText(); + startChildInParent(cnb); buffer.append(ch, start, length); + tempABVS.reset(); + tempABVS.getDataOutput().writeUTF(buffer.toString()); + if (createNodeIds) { + cnb.setLocalNodeId(nodeIdCounter); + } + cnb.setValue(tempABVS); + copyOldCounter++; + endChildInParent(cnb); // Save to leavesABVS leafNodeStart(LEAF_COMMENT); @@ -327,6 +422,16 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { private void flushText() throws IOException { if (pendingText) { + peekENBStackTop().startChild(tnb); + tempABVS.reset(); + tempABVS.getDataOutput().writeUTF(buffer.toString()); + if (createNodeIds) { + 
tnb.setLocalNodeId(nodeIdCounter); + } + tnb.setValue(tempABVS); + copyOldCounter++; + peekENBStackTop().endChild(tnb); + // Save to leavesABVS leafNodeStart(LEAF_TEXT); tnb.reset(leavesABVS); @@ -380,20 +485,51 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { out.writeInt(nodeIdProvider.getId()); } db.write(abvs); + out.write(docABVS.getByteArray(), docABVS.getStartOffset(), docABVS.getLength()); + copyOldCounter++; + System.err.println("copyCounter: " + copyOldCounter); + } + + int currentOffset = 0; + + public void writeOnce(ArrayBackedValueStorage abvs) throws IOException { + DataOutput out = abvs.getDataOutput(); + out.write(ValueTag.NODE_TREE_TAG); + byte header = NodeTreePointable.HEADER_DICTIONARY_EXISTS_MASK; + if (attachTypes) { + header |= NodeTreePointable.HEADER_TYPE_EXISTS_MASK; + } + if (createNodeIds) { + header |= NodeTreePointable.HEADER_NODEID_EXISTS_MASK; + } + out.write(header); + if (createNodeIds) { + out.writeInt(nodeIdProvider.getId()); + } + db.write(abvs); + + copyNewCounter++; // leavesStart.getArray()[i] - int currentOffset = 0; for (int i = 0; i < leavesKind.getSize(); ++i) { if (leavesKind.getArray()[i] == LEAF_PRE_DOCUMENT) { - out.write(leavesABVS.getByteArray(), currentOffset, leavesEnd.getArray()[i] - currentOffset); + flushLeaveNodesUpTo(out, i); + + // for (int x = currentOffset; x < leavesEnd.getArray()[i]; ++x) { + // System.err.println(i + "\t" + leavesKind.getArray()[i] + "\t" + leavesABVS.getByteArray()[x]); + // } int children = leavesChildrenCount.getArray()[i]; + System.err.println("children " + children); if (children > 0) { sequenceSlotStub(abvs, children); } // Continue with nodes. 
+ childrenLength = 0; } else if (leavesKind.getArray()[i] == LEAF_PRE_NODE) { + flushLeaveNodesUpTo(out, i - 1); + int nsCount = 0; int attrCount = leavesAttributeCount.getArray()[i]; int childrenCount = leavesChildrenCount.getArray()[i]; @@ -401,37 +537,61 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { enb.setMvs(abvs); enb.setTagHeader(nsCount, attrCount, childrenCount); - out.write(leavesABVS.getByteArray(), currentOffset, leavesEnd.getArray()[i] - currentOffset); + flushLeaveNodesUpTo(out, i); if (attrCount > 0) { sequenceSlotStub(abvs, attrCount); + int attributeLength = 0; for (int s = 0; s < attrCount; ++s) { ++i; - out.write(leavesABVS.getByteArray(), currentOffset, leavesEnd.getArray()[i] - currentOffset); - updateSequenceSlot(abvs); + attributeLength = leavesEnd.getArray()[i] - currentOffset; + updateSequenceSlot(abvs, attributeLength); } + flushLeaveNodesUpTo(out, i); } if (childrenCount > 0) { sequenceSlotStub(abvs, childrenCount); } // Continue with nodes. 
+ startChildHunk(currentOffset); } else if (leavesKind.getArray()[i] == LEAF_POST_DOCUMENT) { + flushLeaveNodesUpTo(out, i - 1); + // no action } else if (leavesKind.getArray()[i] == LEAF_POST_NODE) { + flushLeaveNodesUpTo(out, i - 1); + updateSequenceSlot(abvs); } else { - out.write(leavesABVS.getByteArray(), currentOffset, leavesEnd.getArray()[i] - currentOffset); - updateSequenceSlot(abvs); + childrenLength = leavesEnd.getArray()[i] - currentOffset; + updateSequenceSlot(abvs, childrenLength); } + } + System.err.println("copyNewCounter: " + copyNewCounter); + } + + private void flushLeaveNodesUpTo(DataOutput out, int i) throws IOException { + if (currentOffset != leavesEnd.getArray()[i]) { + out.write(leavesABVS.getByteArray(), currentOffset, leavesEnd.getArray()[i] - currentOffset); currentOffset = leavesEnd.getArray()[i]; + copyNewCounter++; } } + int childrenLength = 0; + int childrenOffset = 0; + + private void startChildHunk(int currentOffset) { + childrenLength = 0; + childrenOffset = currentOffset; + } + private void sequenceSlotStub(ArrayBackedValueStorage abvs, int count) throws IOException { DataOutput out = abvs.getDataOutput(); out.writeInt(count); + // System.err.println("Slot count " + count); int offset = abvs.getLength(); for (int s = 0; s < count; ++s) { out.writeInt(-1); @@ -450,15 +610,51 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { childSlotCounter++; } - private void updateSequenceSlot(ArrayBackedValueStorage abvs) { + private void updateSequenceSlot(ArrayBackedValueStorage abvs, int length) { // for (int i = 0; i < childSlotCounter; ++i) { // System.err.println("\t" + i + " " + childStartOffset.getArray()[i] + " - " + childSlotOffset.getArray()[i]); // } childSlotCounter--; - int length = abvs.getLength() - childStartOffset.getArray()[childSlotCounter]; + // int length = abvs.getLength() - childStartOffset.getArray()[childSlotCounter]; IntegerPointable.setInteger(abvs.getByteArray(), 
childSlotOffset.getArray()[childSlotCounter], length); } + private void updateSequenceSlot(ArrayBackedValueStorage abvs) { + int length = abvs.getLength() - childStartOffset.getArray()[childSlotCounter]; + updateSequenceSlot(abvs, length); + } + + private ElementNodeBuilder createENB() { + if (freeENBList.isEmpty()) { + return new ElementNodeBuilder(); + } + return freeENBList.remove(freeENBList.size() - 1); + } + + private void freeENB(ElementNodeBuilder enb) { + freeENBList.add(enb); + } + + private ElementNodeBuilder peekENBStackTop() { + return enbStack.get(enbStack.size() - 1); + } + + private void startChildInParent(AbstractNodeBuilder anb) throws IOException { + if (enbStack.isEmpty()) { + docb.startChild(anb); + } else { + peekENBStackTop().startChild(anb); + } + } + + private void endChildInParent(AbstractNodeBuilder anb) throws IOException { + if (enbStack.isEmpty()) { + docb.endChild(anb); + } else { + peekENBStackTop().endChild(anb); + } + } + private void leafNodeStart(int kind) { leavesKind.append(kind); //leavesStart.append(leavesABVS.getLength());
