Author: mreutegg Date: Fri May 2 08:41:29 2014 New Revision: 1591826 URL: http://svn.apache.org/r1591826 Log: OAK-1790: Import of compressed wikipedia dump
Modified: jackrabbit/oak/trunk/oak-run/pom.xml jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java Modified: jackrabbit/oak/trunk/oak-run/pom.xml URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/pom.xml?rev=1591826&r1=1591825&r2=1591826&view=diff ============================================================================== --- jackrabbit/oak/trunk/oak-run/pom.xml (original) +++ jackrabbit/oak/trunk/oak-run/pom.xml Fri May 2 08:41:29 2014 @@ -183,6 +183,11 @@ <version>2.0</version> </dependency> <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-compress</artifactId> + <version>1.8</version> + </dependency> + <dependency> <groupId>org.eclipse.jetty</groupId> <artifactId>jetty-servlet</artifactId> <version>${jetty.version}</version> Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java?rev=1591826&r1=1591825&r2=1591826&view=diff ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java (original) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/benchmark/wikipedia/WikipediaImport.java Fri May 2 08:41:29 2014 @@ -19,7 +19,9 @@ package org.apache.jackrabbit.oak.benchm import static com.google.common.base.Preconditions.checkState; import static java.lang.Math.min; +import java.io.BufferedInputStream; import java.io.File; +import java.io.FileInputStream; import javax.jcr.Node; import javax.jcr.NodeIterator; @@ -32,6 +34,7 @@ import javax.xml.stream.XMLStreamConstan import javax.xml.stream.XMLStreamReader; import javax.xml.transform.stream.StreamSource; +import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.jackrabbit.commons.JcrUtils; import org.apache.jackrabbit.oak.benchmark.Benchmark; import org.apache.jackrabbit.oak.fixture.RepositoryFixture; @@ -101,7 +104,7 @@ public class WikipediaImport extends Ben } String type = "nt:unstructured"; - if (flat) { + if (session.getWorkspace().getNodeTypeManager().hasNodeType("oak:Unstructured")) { type = "oak:Unstructured"; } Node wikipedia = session.getRootNode().addNode("wikipedia", type); @@ -118,8 +121,15 @@ public class WikipediaImport extends Ben String title = null; String text = null; XMLInputFactory factory = XMLInputFactory.newInstance(); - XMLStreamReader reader = - factory.createXMLStreamReader(new StreamSource(dump)); + StreamSource source; + if (dump.getName().endsWith(".xml")) { + source = new StreamSource(dump); + } else { + CompressorStreamFactory csf = new CompressorStreamFactory(); + source = new StreamSource(csf.createCompressorInputStream( + new BufferedInputStream(new FileInputStream(dump)))); + } + XMLStreamReader reader = factory.createXMLStreamReader(source); while (reader.hasNext()) { switch (reader.next()) { case XMLStreamConstants.START_ELEMENT: