Repository: incubator-vxquery Updated Branches: refs/heads/prestonc/benchmark cf09bd9ae -> 8b86884a1
More clean up and fixes for new SAXContentHandler features. - Made doc and doc-available work with the old SAXContentHandler method. (Basically the element writer does not affect the previous version.) - Better variable and function naming. - Attempt to make it clean to create XMLParser and SAXContentHandler. Project: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/commit/8b86884a Tree: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/tree/8b86884a Diff: http://git-wip-us.apache.org/repos/asf/incubator-vxquery/diff/8b86884a Branch: refs/heads/prestonc/benchmark Commit: 8b86884a11ccf08e6b315fefdfb62184083f3f0d Parents: cf09bd9 Author: Preston Carman <[email protected]> Authored: Tue May 20 18:42:34 2014 -0700 Committer: Preston Carman <[email protected]> Committed: Tue May 20 18:42:34 2014 -0700 ---------------------------------------------------------------------- .../VXQueryCollectionOperatorDescriptor.java | 10 +- .../FnDocAvailableScalarEvaluatorFactory.java | 10 +- .../runtime/functions/util/FunctionHelper.java | 4 +- .../vxquery/xmlparser/SAXContentHandler.java | 96 +++++++++++--------- .../org/apache/vxquery/xmlparser/XMLParser.java | 25 ++++- 5 files changed, 88 insertions(+), 57 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/8b86884a/vxquery-core/src/main/java/org/apache/vxquery/metadata/VXQueryCollectionOperatorDescriptor.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/metadata/VXQueryCollectionOperatorDescriptor.java b/vxquery-core/src/main/java/org/apache/vxquery/metadata/VXQueryCollectionOperatorDescriptor.java index 1f4bb2f..a9c7255 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/metadata/VXQueryCollectionOperatorDescriptor.java +++ 
b/vxquery-core/src/main/java/org/apache/vxquery/metadata/VXQueryCollectionOperatorDescriptor.java @@ -71,20 +71,16 @@ public class VXQueryCollectionOperatorDescriptor extends AbstractSingleActivityO final ITreeNodeIdProvider nodeIdProvider = new TreeNodeIdProvider(partitionId, dataSourceId, totalDataSources); final String nodeId = ctx.getJobletContext().getApplicationContext().getNodeId(); final DynamicContext dCtx = (DynamicContext) ctx.getJobletContext().getGlobalJobData(); - final List<SequenceType> childSequenceTypes = new ArrayList<SequenceType>(); final String collectionName = collectionPartitions[partition % collectionPartitions.length]; - final XMLParser parser = new XMLParser(false, nodeIdProvider);; + final XMLParser parser = new XMLParser(false, nodeIdProvider, frame, appender, childSeq, + dCtx.getStaticContext()); return new AbstractUnaryInputUnaryOutputOperatorNodePushable() { @Override public void open() throws HyracksDataException { appender.reset(frame, true); writer.open(); - - for (int typeCode : childSeq) { - childSequenceTypes.add(dCtx.getStaticContext().lookupSequenceType(typeCode)); - } } @Override @@ -100,7 +96,7 @@ public class VXQueryCollectionOperatorDescriptor extends AbstractSingleActivityO Iterator<File> it = FileUtils.iterateFiles(collectionDirectory, new VXQueryIOFileFilter(), TrueFileFilter.INSTANCE); while (it.hasNext()) { - parser.parseOutElements(it.next(), frame, appender, writer, fta, t, childSequenceTypes); + parser.parseOutElements(it.next(), writer, fta, t); } } } else { http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/8b86884a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/node/FnDocAvailableScalarEvaluatorFactory.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/node/FnDocAvailableScalarEvaluatorFactory.java 
b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/node/FnDocAvailableScalarEvaluatorFactory.java index 499119c..ad4b1f0 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/node/FnDocAvailableScalarEvaluatorFactory.java +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/node/FnDocAvailableScalarEvaluatorFactory.java @@ -27,6 +27,9 @@ import org.apache.vxquery.exceptions.SystemException; import org.apache.vxquery.runtime.functions.base.AbstractTaggedValueArgumentScalarEvaluator; import org.apache.vxquery.runtime.functions.base.AbstractTaggedValueArgumentScalarEvaluatorFactory; import org.apache.vxquery.runtime.functions.util.FunctionHelper; +import org.apache.vxquery.xmlparser.ITreeNodeIdProvider; +import org.apache.vxquery.xmlparser.TreeNodeIdProvider; +import org.apache.vxquery.xmlparser.XMLParser; import edu.uci.ics.hyracks.algebricks.common.exceptions.AlgebricksException; import edu.uci.ics.hyracks.algebricks.runtime.base.IScalarEvaluator; @@ -52,6 +55,8 @@ public class FnDocAvailableScalarEvaluatorFactory extends AbstractTaggedValueArg final SequencePointable seqp = (SequencePointable) SequencePointable.FACTORY.createPointable(); final ByteBufferInputStream bbis = new ByteBufferInputStream(); final DataInputStream di = new DataInputStream(bbis); + final int partition = ctx.getTaskAttemptId().getTaskId().getPartition(); + final ITreeNodeIdProvider nodeIdProvider = new TreeNodeIdProvider((short) partition); return new AbstractTaggedValueArgumentScalarEvaluator(args) { @Override @@ -69,9 +74,10 @@ public class FnDocAvailableScalarEvaluatorFactory extends AbstractTaggedValueArg if (tvp.getTag() != ValueTag.XS_STRING_TAG) { throw new SystemException(ErrorCode.FORG0006); } - + tvp.getValue(stringp); try { - FunctionHelper.readInDocFromPointable(stringp, bbis, di, abvs, null); + XMLParser parser = new XMLParser(false, nodeIdProvider); + FunctionHelper.readInDocFromPointable(stringp, bbis, di, abvs, parser); 
XDMConstants.setTrue(result); } catch (Exception e) { XDMConstants.setFalse(result); http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/8b86884a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/util/FunctionHelper.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/util/FunctionHelper.java b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/util/FunctionHelper.java index 4953c12..8074eab 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/util/FunctionHelper.java +++ b/vxquery-core/src/main/java/org/apache/vxquery/runtime/functions/util/FunctionHelper.java @@ -1232,8 +1232,8 @@ public class FunctionHelper { System.err.println(" printUTF8String END"); } - public static void readInDocFromPointable(UTF8StringPointable stringp, ByteBufferInputStream bbis, DataInputStream di, - ArrayBackedValueStorage abvs, XMLParser parser) throws HyracksDataException { + public static void readInDocFromPointable(UTF8StringPointable stringp, ByteBufferInputStream bbis, + DataInputStream di, ArrayBackedValueStorage abvs, XMLParser parser) throws HyracksDataException { String fName; try { fName = getStringFromPointable(stringp, bbis, di); http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/8b86884a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java b/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java index 58a4f03..bd1a354 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java +++ b/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/SAXContentHandler.java @@ -96,7 +96,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { private 
final ArrayBackedValueStorage resultABVS; - private boolean writeMode; + private boolean skipping; private boolean[] subElement = null; @@ -129,12 +129,20 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { freeENBList = new ArrayList<ElementNodeBuilder>(); pendingText = false; tvp = (TaggedValuePointable) TaggedValuePointable.FACTORY.createPointable(); - writeMode = false; + skipping = true; + } + + public SAXContentHandler(boolean attachTypes, ITreeNodeIdProvider nodeIdProvider, ByteBuffer frame, + FrameTupleAppender appender, List<SequenceType> childSequenceTypes) { + this(attachTypes, nodeIdProvider); + this.frame = frame; + this.appender = appender; + setChildPathSteps(childSequenceTypes); } @Override public void characters(char[] ch, int start, int length) throws SAXException { - if (writeMode) { + if (!skipping) { buffer.append(ch, start, length); pendingText = true; } @@ -142,12 +150,14 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { @Override public void endDocument() throws SAXException { - if (writeMode) { + if (!skipping) { try { flushText(); docb.endChildrenChunk(); docb.finish(); - writeElement(); + if (frame != null && appender != null) { + writeElement(); + } } catch (IOException e) { e.printStackTrace(); throw new SAXException(e); @@ -155,22 +165,25 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { } } + private void endElementChildPathStep() throws IOException { + if (foundFirstNonSkippedElement()) { + writeElement(); + } + if (subElement != null && depth <= subElement.length) { + subElement[depth - 1] = false; + } + } + @Override public void endElement(String uri, String localName, String name) throws SAXException { - if (writeMode) { + if (!skipping) { try { flushText(); ElementNodeBuilder enb = enbStack.remove(enbStack.size() - 1); enb.endChildrenChunk(); endChildInParent(enb); - - if (foundChildPathStep()) { - writeElement(); - } - if (subElement != null && 
depth <= subElement.length) { - subElement[depth - 1] = false; - } freeENB(enb); + endElementChildPathStep(); } catch (IOException e) { e.printStackTrace(); throw new SAXException(e); @@ -189,7 +202,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { @Override public void processingInstruction(String target, String data) throws SAXException { - if (writeMode) { + if (!skipping) { try { flushText(); startChildInParent(pinb); @@ -221,43 +234,47 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { @Override public void startDocument() throws SAXException { if (subElement == null) { - writeMode = true; + skipping = false; } - try { - db.reset(); - docABVS.reset(); - if (writeMode) { + db.reset(); + docABVS.reset(); + if (!skipping) { + try { docb.reset(docABVS); if (createNodeIds) { docb.setLocalNodeId(nodeIdCounter++); } docb.startChildrenChunk(); flushText(); + } catch (IOException e) { + e.printStackTrace(); + throw new SAXException(e); } - } catch (IOException e) { - e.printStackTrace(); - throw new SAXException(e); } } - @Override - public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException { - depth++; - // Check path step if it exists. + private boolean startElementChildPathStep(String uri, String localName) { if (subElement != null && depth <= subElement.length) { + // Check path step if it exists. 
if (uri.compareTo(childUri[depth - 1]) == 0) { if (localName.compareTo(childLocalName[depth - 1]) == 0) { subElement[depth - 1] = true; } } } - - boolean start = foundChildPathStep(); + boolean start = foundFirstNonSkippedElement(); if (start) { - writeMode = true; + skipping = false; } + return start; + } - if (writeMode) { + @Override + public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException { + depth++; + boolean start = startElementChildPathStep(uri, localName); + + if (!skipping) { try { flushText(); int idx = name.indexOf(':'); @@ -320,7 +337,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { @Override public void comment(char[] ch, int start, int length) throws SAXException { - if (writeMode) { + if (!skipping) { try { flushText(); startChildInParent(cnb); @@ -379,7 +396,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { public void startEntity(String name) throws SAXException { } - public void setChildPathSteps(List<SequenceType> childSeq) { + private void setChildPathSteps(List<SequenceType> childSeq) { // this.childSeq = childSeq; if (!childSeq.isEmpty()) { subElement = new boolean[childSeq.size()]; @@ -398,10 +415,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { } } - public void setupElementWriter(ByteBuffer frame, FrameTupleAppender appender, IFrameWriter writer, - FrameTupleAccessor fta, int t) throws IOException { - this.frame = frame; - this.appender = appender; + public void setupElementWriter(IFrameWriter writer, FrameTupleAccessor fta, int t) { this.writer = writer; this.fta = fta; this.t = t; @@ -430,7 +444,7 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { } tvp.set(resultABVS.getByteArray(), resultABVS.getStartOffset(), resultABVS.getLength()); addNodeToTuple(tvp, t); - writeMode = false; + skipping = true; } public void writeDocument(ArrayBackedValueStorage abvs) throws 
IOException { @@ -470,8 +484,8 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { startChildInParent(anb, false); } - private void startChildInParent(AbstractNodeBuilder anb, boolean track) throws IOException { - if (track) { + private void startChildInParent(AbstractNodeBuilder anb, boolean startNewElement) throws IOException { + if (startNewElement) { elementABVS.reset(); anb.reset(elementABVS); } else if (enbStack.isEmpty()) { @@ -521,8 +535,8 @@ public class SAXContentHandler implements ContentHandler, LexicalHandler { /** * Determines if the correct path step is active. */ - private boolean foundChildPathStep() { - if (subElement.length != depth) { + private boolean foundFirstNonSkippedElement() { + if (subElement == null || subElement.length != depth) { // Not the correct depth. return false; } http://git-wip-us.apache.org/repos/asf/incubator-vxquery/blob/8b86884a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/XMLParser.java ---------------------------------------------------------------------- diff --git a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/XMLParser.java b/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/XMLParser.java index edef1a1..c1fd6f0 100644 --- a/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/XMLParser.java +++ b/vxquery-core/src/main/java/org/apache/vxquery/xmlparser/XMLParser.java @@ -20,9 +20,11 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.List; import java.util.zip.GZIPInputStream; +import org.apache.vxquery.context.StaticContext; import org.apache.vxquery.exceptions.VXQueryFileNotFoundException; import org.apache.vxquery.exceptions.VXQueryParseException; import org.apache.vxquery.types.SequenceType; @@ -43,9 +45,23 @@ public class XMLParser { final InputSource in; public XMLParser(boolean attachTypes, ITreeNodeIdProvider idProvider) throws 
HyracksDataException { + this(attachTypes, idProvider, null, null, null, null); + } + + public XMLParser(boolean attachTypes, ITreeNodeIdProvider idProvider, ByteBuffer frame, + FrameTupleAppender appender, List<Integer> childSeq, StaticContext staticContext) + throws HyracksDataException { try { parser = XMLReaderFactory.createXMLReader(); - handler = new SAXContentHandler(attachTypes, idProvider); + if (frame == null || appender == null) { + handler = new SAXContentHandler(attachTypes, idProvider); + } else { + List<SequenceType> childSequenceTypes = new ArrayList<SequenceType>(); + for (int typeCode : childSeq) { + childSequenceTypes.add(staticContext.lookupSequenceType(typeCode)); + } + handler = new SAXContentHandler(attachTypes, idProvider, frame, appender, childSequenceTypes); + } parser.setContentHandler(handler); parser.setProperty("http://xml.org/sax/properties/lexical-handler", handler); in = new InputSource(); @@ -72,16 +88,15 @@ public class XMLParser { } } - public void parseOutElements(File file, ByteBuffer frame, FrameTupleAppender appender, IFrameWriter writer, - FrameTupleAccessor fta, int t, List<SequenceType> childSeq) throws HyracksDataException { + public void parseOutElements(File file, IFrameWriter writer, FrameTupleAccessor fta, int t) + throws HyracksDataException { try { if (file.getName().toLowerCase().endsWith(".xml.gz")) { in.setCharacterStream(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)))); } else { in.setCharacterStream(new InputStreamReader(new FileInputStream(file))); } - handler.setChildPathSteps(childSeq); - handler.setupElementWriter(frame, appender, writer, fta, t); + handler.setupElementWriter(writer, fta, t); parser.parse(in); } catch (FileNotFoundException e) { throw new VXQueryFileNotFoundException(e, file);
