Finished the new SAXContentHandler, which has the option to hold copies when they are sequential.
Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/11664ee8 Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/11664ee8 Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/11664ee8 Branch: refs/heads/prestonc/parser Commit: 11664ee82020d48612f4505360b064895cc9cdcb Parents: 278c0db Author: Preston Carman <[email protected]> Authored: Wed Feb 19 17:10:45 2014 -0800 Committer: Preston Carman <[email protected]> Committed: Thu Feb 27 14:24:55 2014 -0800 ---------------------------------------------------------------------- .../vxquery/xmlparser/SAXContentHandler.java | 302 +++---------------- 1 file changed, 43 insertions(+), 259 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/11664ee8/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java b/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java index 2b3d613..e6ebee8 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java +++ b/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java @@ -16,11 +16,8 @@ package org.apache.vxquery.xmlparser; import java.io.DataOutput; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; import org.apache.vxquery.datamodel.accessors.nodes.NodeTreePointable; -import org.apache.vxquery.datamodel.builders.nodes.AbstractNodeBuilder; import org.apache.vxquery.datamodel.builders.nodes.AttributeNodeBuilder; import org.apache.vxquery.datamodel.builders.nodes.CommentNodeBuilder; import org.apache.vxquery.datamodel.builders.nodes.DictionaryBuilder; @@ -42,7 +39,7 @@ import 
edu.uci.ics.hyracks.data.std.primitive.IntegerPointable; import edu.uci.ics.hyracks.data.std.util.ArrayBackedValueStorage; public class SAXContentHandler implements ContentHandler, LexicalHandler { - private final ArrayBackedValueStorage docABVS; + private static final int DEFAULT_INT_SIZE = 600; private final boolean createNodeIds; @@ -50,8 +47,6 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { private final ITreeNodeIdProvider nodeIdProvider; - private final ArrayBackedValueStorage tempABVS; - private final DocumentNodeBuilder docb; private final TextNodeBuilder tnb; @@ -66,13 +61,9 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { private final StringBuilder buffer; - private final List<ElementNodeBuilder> enbStack; - - private final List<ElementNodeBuilder> freeENBList; - private int nodeIdCounter; - private int copyOldCounter = 0; - private int copyNewCounter = 0; + + private int copyNewCounter; private boolean pendingText; @@ -82,10 +73,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { // Structure and data. 
private final GrowableIntArray leavesKind; - // private final GrowableIntArray leavesStart; private final GrowableIntArray leavesEnd; - // private final GrowableIntArray leavesDepth; - // private final GrowableIntArray leavesParent; private final GrowableIntArray leavesAttributeCount; private final GrowableIntArray leavesChildrenCount; @@ -93,6 +81,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { private int textCount = 0; private int textCurrentDepth = 0; private int childSlotCounter = 0; + private int currentOffset = 0; private final GrowableIntArray previousLeaf; private final GrowableIntArray childStartOffset; private final GrowableIntArray childSlotOffset; @@ -108,11 +97,9 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { private final int LEAF_POST_NODE = 7; public SAXContentHandler(boolean attachTypes, ITreeNodeIdProvider nodeIdProvider) { - docABVS = new ArrayBackedValueStorage(); this.createNodeIds = nodeIdProvider != null; this.attachTypes = attachTypes; this.nodeIdProvider = nodeIdProvider; - this.tempABVS = new ArrayBackedValueStorage(); docb = new DocumentNodeBuilder(); tnb = new TextNodeBuilder(); cnb = new CommentNodeBuilder(); @@ -121,25 +108,19 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { anb = new AttributeNodeBuilder(); db = new DictionaryBuilder(); buffer = new StringBuilder(); - enbStack = new ArrayList<ElementNodeBuilder>(); - freeENBList = new ArrayList<ElementNodeBuilder>(); pendingText = false; - leavesKind = new GrowableIntArray(600); - //leavesStart = new GrowableIntArray(600); + leavesKind = new GrowableIntArray(DEFAULT_INT_SIZE); leavesABVS = new ArrayBackedValueStorage(); - leavesEnd = new GrowableIntArray(600); - // leavesDepth = new GrowableIntArray(600); - // leavesParent = new GrowableIntArray(600); - leavesAttributeCount = new GrowableIntArray(600); - leavesChildrenCount = new GrowableIntArray(600); - previousLeaf = new 
GrowableIntArray(600); - childStartOffset = new GrowableIntArray(600); - childSlotOffset = new GrowableIntArray(600); + leavesEnd = new GrowableIntArray(DEFAULT_INT_SIZE); + leavesAttributeCount = new GrowableIntArray(DEFAULT_INT_SIZE); + leavesChildrenCount = new GrowableIntArray(DEFAULT_INT_SIZE); + previousLeaf = new GrowableIntArray(DEFAULT_INT_SIZE); + childStartOffset = new GrowableIntArray(DEFAULT_INT_SIZE); + childSlotOffset = new GrowableIntArray(DEFAULT_INT_SIZE); textCount = 0; textCurrentDepth = 0; childSlotCounter = 0; - } @Override @@ -152,10 +133,6 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { public void endDocument() throws SAXException { try { flushText(); - docb.endChildrenChunk(); - copyOldCounter++; - docb.finish(); - leafNodeStart(LEAF_POST_DOCUMENT); leafNodeEnd(); textCurrentDepth--; @@ -163,30 +140,12 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { e.printStackTrace(); throw new SAXException(e); } - - // int[] k = leavesKind.getArray(); - // int[] s = leavesStart.getArray(); - // int[] e = leavesEnd.getArray(); - // int[] d = leavesDepth.getArray(); - // int[] p = leavesParent.getArray(); - // int[] c = leavesChildrenCount.getArray(); - // int[] a = leavesAttributeCount.getArray(); - // for (int i = 0; i < s.length; ++i) { - // System.err.println(i + " " + k[i] + " - " + d[i] + ":" + s[i] + ":" + e[i] + " p=" + p[i] + " a=" + a[i] - // + " c=" + c[i]); - // } } @Override public void endElement(String uri, String localName, String name) throws SAXException { try { flushText(); - ElementNodeBuilder enb = enbStack.remove(enbStack.size() - 1); - enb.endChildrenChunk(); - copyOldCounter++; - endChildInParent(enb); - freeENB(enb); - leafNodeStart(LEAF_POST_NODE); leafNodeEnd(); textCurrentDepth--; @@ -208,21 +167,6 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { public void processingInstruction(String target, String data) throws SAXException { try { 
flushText(); - startChildInParent(pinb); - tempABVS.reset(); - tempABVS.getDataOutput().writeUTF(target); - if (createNodeIds) { - pinb.setLocalNodeId(nodeIdCounter); - } - pinb.setTarget(tempABVS); - copyOldCounter++; - tempABVS.reset(); - tempABVS.getDataOutput().writeUTF(data); - pinb.setContent(tempABVS); - copyOldCounter++; - endChildInParent(pinb); - - // Save to leavesABVS leafNodeStart(LEAF_PI); pinb.reset(leavesABVS); if (createNodeIds) { @@ -247,14 +191,10 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { @Override public void startDocument() throws SAXException { - copyOldCounter = 0; copyNewCounter = 0; leavesKind.clear(); - //leavesStart.clear(); leavesABVS.reset(); leavesEnd.clear(); - // leavesDepth.clear(); - // leavesParent.clear(); leavesAttributeCount.clear(); leavesChildrenCount.clear(); previousLeaf.clear(); @@ -263,23 +203,17 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { textCount = 0; textCurrentDepth = 0; childSlotCounter = 0; + currentOffset = 0; textCurrentDepth++; try { nodeIdCounter = 0; db.reset(); - docABVS.reset(); - docb.reset(docABVS); - if (createNodeIds) { - docb.setLocalNodeId(nodeIdCounter++); - } - docb.startChildrenChunk(); leafNodeStart(LEAF_PRE_DOCUMENT); - DocumentNodeBuilder docb2 = new DocumentNodeBuilder(); - docb2.reset(leavesABVS); + docb.reset(leavesABVS); if (createNodeIds) { - docb2.setLocalNodeId(nodeIdCounter); + docb.setLocalNodeId(nodeIdCounter++); } leafNodeEnd(); @@ -297,11 +231,12 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { flushText(); int idx = name.indexOf(':'); String prefix = idx < 0 ? 
"" : name.substring(0, idx); - ElementNodeBuilder enb = createENB(); - startChildInParent(enb); int uriCode = db.lookup(uri); int localNameCode = db.lookup(localName); int prefixCode = db.lookup(prefix); + + leafNodeStart(LEAF_PRE_NODE); + enb.setMvs(leavesABVS); enb.setName(uriCode, localNameCode, prefixCode); if (attachTypes) { int typeUriCode = db.lookup(XQueryConstants.XS_NSURI); @@ -312,21 +247,6 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { if (createNodeIds) { enb.setLocalNodeId(nodeIdCounter++); } - - // Save to leavesABVS - leafNodeStart(LEAF_PRE_NODE); - ElementNodeBuilder enb2 = createENB(); - enb2.setMvs(leavesABVS); - enb2.setName(uriCode, localNameCode, prefixCode); - if (attachTypes) { - int typeUriCode = db.lookup(XQueryConstants.XS_NSURI); - int typeLocalNameCode = db.lookup(BuiltinTypeQNames.UNTYPED_STR); - int typePrefixCode = db.lookup(XQueryConstants.XS_PREFIX); - enb2.setType(typeUriCode, typeLocalNameCode, typePrefixCode); - } - if (createNodeIds) { - enb2.setLocalNodeId(nodeIdCounter); - } leafNodeEnd(); textCurrentDepth++; @@ -338,27 +258,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { int aPrefixCode = db.lookup(aIdx < 0 ? 
"" : aName.substring(0, aIdx)); int aLocalNameCode = db.lookup(atts.getLocalName(i)); int aUriCode = db.lookup(atts.getURI(i)); - String aValue = atts.getValue(i); - tempABVS.reset(); - DataOutput tempOut = tempABVS.getDataOutput(); - tempOut.write(ValueTag.XS_UNTYPED_ATOMIC_TAG); - tempOut.writeUTF(aValue); - enb.startAttribute(anb); - anb.setName(aUriCode, aLocalNameCode, aPrefixCode); - if (attachTypes) { - int typeUriCode = db.lookup(XQueryConstants.XS_NSURI); - int typeLocalNameCode = db.lookup(BuiltinTypeQNames.UNTYPED_ATOMIC_STR); - int typePrefixCode = db.lookup(XQueryConstants.XS_PREFIX); - anb.setType(typeUriCode, typeLocalNameCode, typePrefixCode); - } - if (createNodeIds) { - anb.setLocalNodeId(nodeIdCounter++); - } - anb.setValue(tempABVS); - copyOldCounter++; - enb.endAttribute(anb); - // Save to leavesABVS leafNodeStart(LEAF_ATTRIBUTE); anb.reset(leavesABVS); anb.setName(aUriCode, aLocalNameCode, aPrefixCode); @@ -373,12 +273,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { } anb.setValue(atts.getValue(i)); leafNodeEnd(); - } - enb.endAttributeChunk(); - copyOldCounter++; - enb.startChildrenChunk(); - enbStack.add(enb); } catch (IOException e) { e.printStackTrace(); throw new SAXException(e); @@ -393,16 +288,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { public void comment(char[] ch, int start, int length) throws SAXException { try { flushText(); - startChildInParent(cnb); buffer.append(ch, start, length); - tempABVS.reset(); - tempABVS.getDataOutput().writeUTF(buffer.toString()); - if (createNodeIds) { - cnb.setLocalNodeId(nodeIdCounter); - } - cnb.setValue(tempABVS); - copyOldCounter++; - endChildInParent(cnb); // Save to leavesABVS leafNodeStart(LEAF_COMMENT); @@ -422,17 +308,6 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { private void flushText() throws IOException { if (pendingText) { - peekENBStackTop().startChild(tnb); - tempABVS.reset(); - 
tempABVS.getDataOutput().writeUTF(buffer.toString()); - if (createNodeIds) { - tnb.setLocalNodeId(nodeIdCounter); - } - tnb.setValue(tempABVS); - copyOldCounter++; - peekENBStackTop().endChild(tnb); - - // Save to leavesABVS leafNodeStart(LEAF_TEXT); tnb.reset(leavesABVS); if (createNodeIds) { @@ -470,29 +345,31 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { public void startEntity(String name) throws SAXException { } - public void write(ArrayBackedValueStorage abvs) throws IOException { - DataOutput out = abvs.getDataOutput(); - out.write(ValueTag.NODE_TREE_TAG); - byte header = NodeTreePointable.HEADER_DICTIONARY_EXISTS_MASK; - if (attachTypes) { - header |= NodeTreePointable.HEADER_TYPE_EXISTS_MASK; - } - if (createNodeIds) { - header |= NodeTreePointable.HEADER_NODEID_EXISTS_MASK; + private void leafNodeStart(int kind) { + leavesKind.append(kind); + leavesAttributeCount.append(0); + leavesChildrenCount.append(0); + + int parent = previousLeaf.getArray()[textCurrentDepth - 1]; + if (kind == LEAF_POST_NODE || kind == LEAF_POST_DOCUMENT) { + // Skip Count + } else if (kind == LEAF_ATTRIBUTE) { + leavesAttributeCount.getArray()[parent] += 1; + } else if (textCount != parent) { + leavesChildrenCount.getArray()[parent] += 1; } - out.write(header); - if (createNodeIds) { - out.writeInt(nodeIdProvider.getId()); + // If it can have children. 
+ if (kind == LEAF_PRE_NODE || kind == LEAF_PRE_DOCUMENT) { + previousLeaf.getArray()[textCurrentDepth] = textCount; } - db.write(abvs); - out.write(docABVS.getByteArray(), docABVS.getStartOffset(), docABVS.getLength()); - copyOldCounter++; - System.err.println("copyCounter: " + copyOldCounter); } - int currentOffset = 0; + private void leafNodeEnd() { + leavesEnd.append(leavesABVS.getLength()); + textCount++; + } - public void writeOnce(ArrayBackedValueStorage abvs) throws IOException { + public void write(ArrayBackedValueStorage abvs) throws IOException { DataOutput out = abvs.getDataOutput(); out.write(ValueTag.NODE_TREE_TAG); byte header = NodeTreePointable.HEADER_DICTIONARY_EXISTS_MASK; @@ -507,25 +384,17 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { out.writeInt(nodeIdProvider.getId()); } db.write(abvs); - copyNewCounter++; - // leavesStart.getArray()[i] for (int i = 0; i < leavesKind.getSize(); ++i) { if (leavesKind.getArray()[i] == LEAF_PRE_DOCUMENT) { flushLeaveNodesUpTo(out, i); - // for (int x = currentOffset; x < leavesEnd.getArray()[i]; ++x) { - // System.err.println(i + "\t" + leavesKind.getArray()[i] + "\t" + leavesABVS.getByteArray()[x]); - // } - int children = leavesChildrenCount.getArray()[i]; - System.err.println("children " + children); if (children > 0) { sequenceSlotStub(abvs, children); } // Continue with nodes. 
- childrenLength = 0; } else if (leavesKind.getArray()[i] == LEAF_PRE_NODE) { flushLeaveNodesUpTo(out, i - 1); @@ -541,11 +410,9 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { if (attrCount > 0) { sequenceSlotStub(abvs, attrCount); - int attributeLength = 0; for (int s = 0; s < attrCount; ++s) { ++i; - attributeLength = leavesEnd.getArray()[i] - currentOffset; - updateSequenceSlot(abvs, attributeLength); + updateSequenceSlot(abvs, leavesEnd.getArray()[i] - currentOffset); } flushLeaveNodesUpTo(out, i); } @@ -554,22 +421,19 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { sequenceSlotStub(abvs, childrenCount); } // Continue with nodes. - startChildHunk(currentOffset); } else if (leavesKind.getArray()[i] == LEAF_POST_DOCUMENT) { flushLeaveNodesUpTo(out, i - 1); - // no action } else if (leavesKind.getArray()[i] == LEAF_POST_NODE) { flushLeaveNodesUpTo(out, i - 1); - updateSequenceSlot(abvs); + updateSequenceSlot(abvs, 0); } else { - childrenLength = leavesEnd.getArray()[i] - currentOffset; - updateSequenceSlot(abvs, childrenLength); + updateSequenceSlot(abvs, leavesEnd.getArray()[i] - currentOffset); } } - System.err.println("copyNewCounter: " + copyNewCounter); +// System.err.println("copyNewCounter: " + copyNewCounter); } private void flushLeaveNodesUpTo(DataOutput out, int i) throws IOException { @@ -580,28 +444,14 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { } } - int childrenLength = 0; - int childrenOffset = 0; - - private void startChildHunk(int currentOffset) { - childrenLength = 0; - childrenOffset = currentOffset; - } - private void sequenceSlotStub(ArrayBackedValueStorage abvs, int count) throws IOException { DataOutput out = abvs.getDataOutput(); out.writeInt(count); - // System.err.println("Slot count " + count); int offset = abvs.getLength(); for (int s = 0; s < count; ++s) { out.writeInt(-1); addSequenceSlot(offset, s, count); } - // - // for (int i = 0; i < 
childSlotCounter; ++i) { - // System.err.println("\t\t" + i + " " + childStartOffset.getArray()[i] + " - " - // + childSlotOffset.getArray()[i]); - // } } private void addSequenceSlot(int offset, int count, int total) { @@ -610,76 +460,10 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { childSlotCounter++; } - private void updateSequenceSlot(ArrayBackedValueStorage abvs, int length) { - // for (int i = 0; i < childSlotCounter; ++i) { - // System.err.println("\t" + i + " " + childStartOffset.getArray()[i] + " - " + childSlotOffset.getArray()[i]); - // } + private void updateSequenceSlot(ArrayBackedValueStorage abvs, int additional) { childSlotCounter--; - // int length = abvs.getLength() - childStartOffset.getArray()[childSlotCounter]; + int length = abvs.getLength() - childStartOffset.getArray()[childSlotCounter] + additional; IntegerPointable.setInteger(abvs.getByteArray(), childSlotOffset.getArray()[childSlotCounter], length); } - private void updateSequenceSlot(ArrayBackedValueStorage abvs) { - int length = abvs.getLength() - childStartOffset.getArray()[childSlotCounter]; - updateSequenceSlot(abvs, length); - } - - private ElementNodeBuilder createENB() { - if (freeENBList.isEmpty()) { - return new ElementNodeBuilder(); - } - return freeENBList.remove(freeENBList.size() - 1); - } - - private void freeENB(ElementNodeBuilder enb) { - freeENBList.add(enb); - } - - private ElementNodeBuilder peekENBStackTop() { - return enbStack.get(enbStack.size() - 1); - } - - private void startChildInParent(AbstractNodeBuilder anb) throws IOException { - if (enbStack.isEmpty()) { - docb.startChild(anb); - } else { - peekENBStackTop().startChild(anb); - } - } - - private void endChildInParent(AbstractNodeBuilder anb) throws IOException { - if (enbStack.isEmpty()) { - docb.endChild(anb); - } else { - peekENBStackTop().endChild(anb); - } - } - - private void leafNodeStart(int kind) { - leavesKind.append(kind); - 
//leavesStart.append(leavesABVS.getLength()); - // leavesDepth.append(textCurrentDepth); - leavesAttributeCount.append(0); - leavesChildrenCount.append(0); - - int parent = previousLeaf.getArray()[textCurrentDepth - 1]; - // leavesParent.append(parent); - if (kind == LEAF_POST_NODE || kind == LEAF_POST_DOCUMENT) { - // Skip Count - } else if (kind == LEAF_ATTRIBUTE) { - leavesAttributeCount.getArray()[parent] += 1; - } else if (textCount != parent) { - leavesChildrenCount.getArray()[parent] += 1; - } - // If it can have children. - if (kind == LEAF_PRE_NODE || kind == LEAF_PRE_DOCUMENT) { - previousLeaf.getArray()[textCurrentDepth] = textCount; - } - } - - private void leafNodeEnd() { - leavesEnd.append(leavesABVS.getLength()); - textCount++; - } - } \ No newline at end of file
