Author: chetanm Date: Fri Jul 10 11:46:03 2015 New Revision: 1690249 URL: http://svn.apache.org/r1690249 Log: OAK-2953 - Implement text extractor as part of oak-run
-- Add Tika dependency to oak-run -- Ensure that various parser related dependency do not get pulled in while building oak-run. Changed assembly config for that -- Exposed a new 'tika' command Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java (with props) jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java (with props) jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java (with props) jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java (with props) jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java (with props) jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java (with props) jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java (with props) jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java (with props) jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java (with props) jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java (with props) jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java (with props) jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java (with props) 
jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java (with props) jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java (with props) Modified: jackrabbit/oak/trunk/oak-run/pom.xml jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml Modified: jackrabbit/oak/trunk/oak-run/pom.xml URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/pom.xml?rev=1690249&r1=1690248&r2=1690249&view=diff ============================================================================== --- jackrabbit/oak/trunk/oak-run/pom.xml (original) +++ jackrabbit/oak/trunk/oak-run/pom.xml Fri Jul 10 11:46:03 2015 @@ -362,6 +362,22 @@ <scope>compile</scope> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers</artifactId> + <version>1.5</version> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-core</artifactId> + <version>1.5</version> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-csv</artifactId> + <version>1.1</version> + </dependency> + <!-- Findbugs annotations --> <dependency> <groupId>com.google.code.findbugs</groupId> Modified: jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml?rev=1690249&r1=1690248&r2=1690249&view=diff ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml (original) +++ jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml Fri Jul 10 11:46:03 2015 @@ -38,11 +38,13 @@ <excludes> <exclude>org.apache.jackrabbit:oak-lucene</exclude> 
<exclude>org.apache.lucene</exclude> + <exclude>org.apache.tika</exclude> </excludes> <useStrictFiltering>true</useStrictFiltering> <useProjectArtifact>true</useProjectArtifact> <unpack>true</unpack> <useTransitiveDependencies>true</useTransitiveDependencies> + <useTransitiveFiltering>true</useTransitiveFiltering> <unpackOptions> <excludes> <exclude>META-INF/*.SF</exclude> Modified: jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml?rev=1690249&r1=1690248&r2=1690249&view=diff ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml (original) +++ jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml Fri Jul 10 11:46:03 2015 @@ -33,11 +33,14 @@ <exclude>org.apache.jackrabbit:jackrabbit-core</exclude> <exclude>org.apache.lucene</exclude> <exclude>org.apache.derby</exclude> + <exclude>org.apache.tika:tika-core:*</exclude> + <exclude>org.apache.tika:tika-parsers:*</exclude> </excludes> <useStrictFiltering>true</useStrictFiltering> <useProjectArtifact>true</useProjectArtifact> <unpack>true</unpack> <useTransitiveDependencies>true</useTransitiveDependencies> + <useTransitiveFiltering>true</useTransitiveFiltering> <unpackOptions> <excludes> <exclude>META-INF/*.SF</exclude> @@ -51,5 +54,24 @@ </excludes> </unpackOptions> </dependencySet> + <!-- Exclude the transitive dependency as tika-parsers depend + on many other jars. 
Instead users can include tika-app.jar in classpath--> + <dependencySet> + <outputDirectory>/</outputDirectory> + <includes> + <include>org.apache.tika:tika-core</include> + <include>org.apache.tika:tika-parsers</include> + </includes> + <useStrictFiltering>true</useStrictFiltering> + <useTransitiveDependencies>false</useTransitiveDependencies> + <unpack>true</unpack> + <unpackOptions> + <excludes> + <exclude>META-INF/*.SF</exclude> + <exclude>META-INF/*.DSA</exclude> + <exclude>META-INF/*.RSA</exclude> + </excludes> + </unpackOptions> + </dependencySet> </dependencySets> </assembly> Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java?rev=1690249&view=auto ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java (added) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java Fri Jul 10 11:46:03 2015 @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.jackrabbit.oak.plugins.tika; + +import javax.annotation.CheckForNull; +import javax.annotation.Nullable; + +import com.google.common.io.ByteSource; + +import static com.google.common.base.Preconditions.checkNotNull; + +class BinaryResource { + private final ByteSource byteSource; + private final String mimeType; + private final String encoding; + private final String path; + private final String blobId; + + public BinaryResource(ByteSource byteSource, + @Nullable String mimeType, + @Nullable String encoding, + String path, + String blobId) { + this.byteSource = checkNotNull(byteSource, "ByteSource must be provided"); + this.mimeType = mimeType; + this.encoding = encoding; + this.path = checkNotNull(path, "Path must be provided"); + this.blobId = checkNotNull(blobId, "BlobId must be specified"); + } + + public ByteSource getByteSource() { + return byteSource; + } + + @CheckForNull + public String getMimeType() { + return mimeType; + } + + @CheckForNull + public String getEncoding() { + return encoding; + } + + public String getPath() { + return path; + } + + public String getBlobId() { + return blobId; + } + + @Override + public String toString() { + return path; + } +} Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java?rev=1690249&view=auto ============================================================================== --- 
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java (added) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java Fri Jul 10 11:46:03 2015 @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.jackrabbit.oak.plugins.tika; + +import java.io.IOException; + +import com.google.common.collect.FluentIterable; + +/** + * Provides an iterator for binaries present under given path + */ +interface BinaryResourceProvider { + + FluentIterable<BinaryResource> getBinaries(String path) throws IOException; +} Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java?rev=1690249&view=auto ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java (added) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java Fri Jul 10 11:46:03 2015 @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.jackrabbit.oak.plugins.tika; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import com.google.common.base.Strings; +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Maps; +import org.codehaus.groovy.runtime.StringGroovyMethods; + +import static org.apache.jackrabbit.oak.commons.IOUtils.humanReadableByteCount; + +class BinaryStats { + private final TikaHelper tika; + private final List<MimeTypeStats> stats; + private long totalSize; + private long totalCount; + private long indexedSize; + private long indexedCount; + + public BinaryStats(File tikaConfig, BinaryResourceProvider provider) throws IOException { + this.tika = new TikaHelper(tikaConfig); + this.stats = collectStats(provider); + } + + public long getTotalSize() { + return totalSize; + } + + public long getTotalCount() { + return totalCount; + } + + public long getIndexedSize() { + return indexedSize; + } + + public long getIndexedCount() { + return indexedCount; + } + + public String getSummary() throws IOException { + return getSummary(stats); + } + + private List<MimeTypeStats> collectStats(BinaryResourceProvider provider) throws IOException { + Map<String, MimeTypeStats> stats = Maps.newHashMap(); + for (BinaryResource binary : provider.getBinaries("/")) { + String mimeType = binary.getMimeType(); + if (mimeType != null) { + MimeTypeStats mimeStats = stats.get(mimeType); + if (mimeStats == null) { + mimeStats = createStat(mimeType); + stats.put(mimeType, mimeStats); + } + + long size = binary.getByteSource().size(); + mimeStats.addSize(size); + totalSize += size; + totalCount++; + + if (mimeStats.isIndexed()) { + indexedSize += size; + indexedCount++; + } + } + } + + List<MimeTypeStats> result = new ArrayList<MimeTypeStats>(stats.values()); + Collections.sort(result, 
Collections.reverseOrder()); + return result; + } + + private String getSummary(List<MimeTypeStats> stats) { + int maxWidth = 0; + for (MimeTypeStats s : stats) { + maxWidth = Math.max(maxWidth, s.getName().length()); + } + + maxWidth += 5; + + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw); + pw.println("MimeType Stats"); + pw.printf("\tTotal size : %s%n", humanReadableByteCount(totalSize)); + pw.printf("\tTotal indexed size : %s%n", humanReadableByteCount(indexedSize)); + pw.printf("\tTotal count : %d%n", totalCount); + pw.printf("\tTotal indexed count : %d%n", indexedCount); + pw.println(); + + String header = center("Type", maxWidth) + " " + + center("Indexed", 10) + " " + + center("Supported", 10) + " " + + center("Count", 10) + " " + + center("Size", 10); + + pw.println(header); + pw.println(Strings.repeat("_", header.length() + 5)); + + for (MimeTypeStats s : stats) { + pw.printf("%-" + maxWidth + "s|%10s|%10s| %-8d|%10s%n", + s.getName(), + s.isIndexed(), + s.isSupported(), + s.getCount(), + humanReadableByteCount(s.getTotalSize())); + } + return sw.toString(); + } + + private MimeTypeStats createStat(String mimeType) { + MimeTypeStats stats = new MimeTypeStats(mimeType); + stats.setIndexed(tika.isIndexed(mimeType)); + stats.setSupported(tika.isSupportedMediaType(mimeType)); + return stats; + } + + private static String center(String s, int width) { + return StringGroovyMethods.center(s, width); + } + + private static class MimeTypeStats implements Comparable<MimeTypeStats> { + private final String mimeType; + private int count; + private long totalSize; + private boolean supported; + private boolean indexed; + + public MimeTypeStats(String mimeType) { + this.mimeType = mimeType; + } + + public void addSize(long size) { + count++; + totalSize += size; + } + + public void setSupported(boolean supported) { + this.supported = supported; + } + + public void setIndexed(boolean indexed) { + this.indexed = indexed; + } + + public long 
getTotalSize() { + return totalSize; + } + + public int getCount() { + return count; + } + + public String getName() { + return mimeType; + } + + public boolean isIndexed() { + return indexed; + } + + public boolean isSupported() { + return supported; + } + + @Override + public int compareTo(MimeTypeStats o) { + return ComparisonChain.start() + .compareFalseFirst(indexed, o.indexed) + .compare(totalSize, o.totalSize) + .result(); + } + } +} Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java?rev=1690249&view=auto ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java (added) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java Fri Jul 10 11:46:03 2015 @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.jackrabbit.oak.plugins.tika; + +import java.io.IOException; +import java.io.InputStream; + +import javax.annotation.Nullable; + +import com.google.common.io.ByteSource; +import org.apache.jackrabbit.oak.spi.blob.BlobStore; + +/** + * Avoiding use of BlobByteSource to avoid concurrent access to NodeState + */ +class BlobStoreByteSource extends ByteSource { + private final BlobStore blobStore; + private final String blobId; + private final Long size; + + BlobStoreByteSource(BlobStore blobStore, String blobId,@Nullable Long size) { + this.blobStore = blobStore; + this.blobId = blobId; + this.size = size; + } + + BlobStoreByteSource(BlobStore blobStore, String blobId) { + this(blobStore, blobId, null); + } + + @Override + public InputStream openStream() throws IOException { + return blobStore.getInputStream(blobId); + } + + @Override + public long size() throws IOException { + if (size != null) { + return size; + } + return blobStore.getBlobLength(blobId); + } +} Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java?rev=1690249&view=auto ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java (added) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java Fri Jul 10 11:46:03 2015 @@ -0,0 +1,115 @@ 
+/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.jackrabbit.oak.plugins.tika; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; + +import javax.annotation.Nullable; + +import com.google.common.base.Charsets; +import com.google.common.base.Function; +import com.google.common.base.Predicate; +import com.google.common.collect.FluentIterable; +import com.google.common.io.Closer; +import com.google.common.primitives.Longs; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; +import org.apache.jackrabbit.oak.commons.PathUtils; +import org.apache.jackrabbit.oak.spi.blob.BlobStore; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Predicates.notNull; +import static org.apache.jackrabbit.JcrConstants.JCR_ENCODING; +import static org.apache.jackrabbit.JcrConstants.JCR_MIMETYPE; +import static org.apache.jackrabbit.JcrConstants.JCR_PATH; + +class CSVFileBinaryResourceProvider implements BinaryResourceProvider, Closeable { + private static final String BLOB_ID = "blobId"; + private 
static final String LENGTH = "length"; + static final CSVFormat FORMAT = CSVFormat.DEFAULT + .withCommentMarker('#') + .withHeader( + BLOB_ID, + LENGTH, + JCR_MIMETYPE, + JCR_ENCODING, + JCR_PATH + ) + .withNullString("") //Empty string are considered as null + .withIgnoreSurroundingSpaces() + .withSkipHeaderRecord(); + private final Logger log = LoggerFactory.getLogger(getClass()); + private final File dataFile; + private final BlobStore blobStore; + private final Closer closer = Closer.create(); + + public CSVFileBinaryResourceProvider(File dataFile, @Nullable BlobStore blobStore) { + checkArgument(dataFile.exists(), "Data file %s does not exist", dataFile); + this.dataFile = dataFile; + this.blobStore = blobStore; + } + + @Override + public FluentIterable<BinaryResource> getBinaries(final String path) throws IOException { + CSVParser parser = CSVParser.parse(dataFile, Charsets.UTF_8, FORMAT); + closer.register(parser); + return FluentIterable.from(parser) + .transform(new RecordTransformer()) + .filter(notNull()) + .filter(new Predicate<BinaryResource>() { + @Override + public boolean apply(BinaryResource input) { + return PathUtils.isAncestor(path, input.getPath()); + } + }); + } + + @Override + public void close() throws IOException { + closer.close(); + } + + private class RecordTransformer implements Function<CSVRecord, BinaryResource> { + + @Nullable + @Override + public BinaryResource apply(CSVRecord input) { + String path = input.get(JCR_PATH); + String mimeType = input.get(JCR_MIMETYPE); + String encoding = input.get(JCR_ENCODING); + String blobId = input.get(BLOB_ID); + String length = input.get(LENGTH); + Long len = length != null ? Longs.tryParse(length) : null; + if (path == null || blobId == null || mimeType == null) { + log.warn("Ignoring invalid record {}. 
Either of mimeType, blobId or path is null", input); + return null; + } + + return new BinaryResource(new BlobStoreByteSource(blobStore, blobId, len), + mimeType, encoding, path, blobId); + } + } + +} Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java?rev=1690249&view=auto ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java (added) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java Fri Jul 10 11:46:03 2015 @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.jackrabbit.oak.plugins.tika; + +import javax.annotation.CheckForNull; +import javax.annotation.Nullable; + +import com.google.common.base.Function; +import com.google.common.collect.FluentIterable; +import com.google.common.collect.TreeTraverser; +import org.apache.jackrabbit.JcrConstants; +import org.apache.jackrabbit.oak.api.Blob; +import org.apache.jackrabbit.oak.api.PropertyState; +import org.apache.jackrabbit.oak.api.Tree; +import org.apache.jackrabbit.oak.api.Type; +import org.apache.jackrabbit.oak.spi.blob.BlobStore; +import org.apache.jackrabbit.oak.spi.state.NodeStore; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static com.google.common.base.Predicates.notNull; +import static org.apache.jackrabbit.oak.plugins.tree.TreeFactory.createReadOnlyTree; +import static org.apache.jackrabbit.oak.spi.state.NodeStateUtils.getNode; + +class NodeStoreBinaryResourceProvider implements BinaryResourceProvider { + private static final Logger log = LoggerFactory.getLogger(NodeStoreBinaryResourceProvider.class); + private final NodeStore nodeStore; + private final BlobStore blobStore; + + public NodeStoreBinaryResourceProvider(NodeStore nodeStore, BlobStore blobStore) { + this.nodeStore = nodeStore; + this.blobStore = blobStore; + } + + public FluentIterable<BinaryResource> getBinaries(String path) { + return new OakTreeTraverser() + .preOrderTraversal(createReadOnlyTree(getNode(nodeStore.getRoot(), path))) + .transform(new TreeToBinarySource()) + .filter(notNull()); + } + + private class TreeToBinarySource implements Function<Tree, BinaryResource> { + @Nullable + @Override + public BinaryResource apply(Tree tree) { + PropertyState data = tree.getProperty(JcrConstants.JCR_DATA); + if (data == null) { + return null; + } + + if (data.isArray()) { + log.debug("Ignoring jcr:data property at {} as its a MVP", tree.getPath()); + return null; + } + + Blob blob = data.getValue(Type.BINARY); + String blobId = 
blob.getContentIdentity(); + if (blobId == null) { + log.debug("Ignoring jcr:data property at {} as its an inlined blob", tree.getPath()); + return null; + } + + String mimeType = getString(tree, JcrConstants.JCR_MIMETYPE); + String encoding = getString(tree, JcrConstants.JCR_ENCODING); + + return new BinaryResource(new BlobStoreByteSource(blobStore, blobId), mimeType, + encoding, tree.getPath(), blobId); + } + } + + private static class OakTreeTraverser extends TreeTraverser<Tree> { + @Override + public Iterable<Tree> children(Tree root) { + return root.getChildren(); + } + } + + @CheckForNull + private static String getString(Tree tree, String name) { + PropertyState prop = tree.getProperty(name); + return prop != null ? prop.getValue(Type.STRING) : null; + } +} Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java?rev=1690249&view=auto ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java (added) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java Fri Jul 10 11:46:03 2015 @@ -0,0 +1,300 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */

package org.apache.jackrabbit.oak.plugins.tika;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import com.google.common.io.ByteSource;
import com.google.common.io.CountingInputStream;
import org.apache.jackrabbit.oak.commons.IOUtils;
import org.apache.jackrabbit.oak.commons.io.LazyInputStream;
import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.WriteOutContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Extracts text from binaries with Tika and passes the outcome to a
 * {@link TextWriter}.
 *
 * <p>{@link #extract(Iterable)} places one work item per binary on a bounded
 * queue. A fixed pool of worker threads, started on the first call, takes the
 * items from that queue and parses them. {@link #close()} queues a shutdown
 * marker, waits for the workers to finish and logs the collected counters.
 */
class TextExtractor implements Closeable {
    private static final Logger log = LoggerFactory.getLogger(TextExtractor.class);
    private static final Logger parserError = LoggerFactory.getLogger("org.apache.jackrabbit.oak.plugins.tika.ParserError");
    private static final int PROGRESS_BATCH_SIZE = 1000;
    private static final int MAX_EXTRACT_LENGTH = 100000;
    // Sentinel returned by parseStringValue() when the parser failed
    private static final String ERROR_TEXT = "TextExtractionError";

    private final TextWriter textWriter;

    // Marker item; a worker that takes it puts it back and terminates
    private final WorkItem SHUTDOWN_SIGNAL = new WorkItem(null);
    private BlockingQueue<WorkItem> inputQueue;
    private ExecutorService executorService;
    private int threadPoolSize = Runtime.getRuntime().availableProcessors();
    private int queueSize = 100;

    private final AtomicInteger errorCount = new AtomicInteger();
    private final AtomicLong timeTaken = new AtomicLong();
    private final AtomicInteger extractionCount = new AtomicInteger();
    private final AtomicInteger textWrittenCount = new AtomicInteger();
    private final AtomicInteger parserErrorCount = new AtomicInteger();
    private final AtomicInteger processedCount = new AtomicInteger();
    private final AtomicInteger emptyCount = new AtomicInteger();
    private final AtomicInteger notSupportedCount = new AtomicInteger();
    private final AtomicInteger alreadyExtractedCount = new AtomicInteger();
    private final AtomicLong extractedTextSize = new AtomicLong();
    private final AtomicLong nonEmptyExtractedTextSize = new AtomicLong();
    private final AtomicLong totalSizeRead = new AtomicLong();

    private int maxExtractedLength = MAX_EXTRACT_LENGTH;
    private File tikaConfig;
    private TikaHelper tika;
    private boolean initialized;
    private BinaryStats stats;
    private boolean closed;

    public TextExtractor(TextWriter textWriter) {
        this.textWriter = textWriter;
    }

    /**
     * Queues every given binary for extraction. The call blocks while the
     * bounded queue is full and returns once the last item has been queued,
     * which can be before the workers have processed it.
     *
     * @param binaries binaries to process
     * @throws InterruptedException if interrupted while waiting for queue space
     * @throws IOException if the Tika setup performed on first use fails
     * @throws IllegalStateException if this extractor has already been closed
     */
    public void extract(Iterable<BinaryResource> binaries) throws InterruptedException, IOException {
        // After close() the workers are gone; queued items would never be
        // consumed and put() would block once the queue is full
        if (closed) {
            throw new IllegalStateException("TextExtractor is already closed");
        }
        initialize();
        for (BinaryResource binary : binaries) {
            inputQueue.put(new WorkItem(binary));
        }
    }

    /**
     * Signals the workers to stop, waits for them to drain the queue and logs
     * the statistics. Calling it more than once has no further effect.
     */
    @Override
    public void close() {
        if (closed) {
            return;
        }
        // inputQueue is created by initialize() only, so it is still null
        // when close() runs without a preceding extract() call
        if (inputQueue != null && !inputQueue.isEmpty()) {
            log.info("Shutting down the extractor. Pending task count {}", inputQueue.size());
        }

        if (executorService != null) {
            try {
                inputQueue.put(SHUTDOWN_SIGNAL);
                executorService.shutdown();
                //Wait long enough
                executorService.awaitTermination(10, TimeUnit.DAYS);
            } catch (InterruptedException e) {
                // The shutdown marker may not have been queued, so interrupt
                // the workers instead of leaving them blocked on take()
                executorService.shutdownNow();
                Thread.currentThread().interrupt();
            }
        }
        dumpStats();
        closed = true;
    }

    public void setTikaConfig(File tikaConfig) {
        this.tikaConfig = tikaConfig;
    }

    public void setThreadPoolSize(int threadPoolSize) {
        this.threadPoolSize = threadPoolSize;
    }

    public void setStats(BinaryStats stats) {
        this.stats = stats;
    }

    /** Logs all counters as a single multi line info message. */
    private void dumpStats() {
        StringWriter sw = new StringWriter();
        PrintWriter pw = new PrintWriter(sw);
        pw.println("Text extraction stats");
        pw.printf("\t Processed Count : %d%n", processedCount.get());
        pw.printf("\t Extraction Count : %d%n", extractionCount.get());
        pw.printf("\t Empty Count : %d%n", emptyCount.get());
        pw.printf("\t Text Written Count : %d%n", textWrittenCount.get());
        pw.printf("\t Parser Error Count : %d%n", parserErrorCount.get());
        pw.printf("\t Error Count : %d%n", errorCount.get());
        pw.printf("\t Not Supported Count : %d%n", notSupportedCount.get());
        pw.printf("\t Already processed Count : %d%n", alreadyExtractedCount.get());
        pw.printf("\t Total bytes read : %s%n", IOUtils.humanReadableByteCount(totalSizeRead.get()));
        pw.printf("\t Total text extracted : %s%n", IOUtils.humanReadableByteCount(extractedTextSize.get()));
        pw.printf("\t Non empty text : %s%n", IOUtils.humanReadableByteCount(nonEmptyExtractedTextSize.get()));
        pw.printf("\t Time taken : %d sec%n", timeTaken.get() / 1000);
        pw.close();
        log.info(sw.toString());
    }

    /**
     * Logs a progress line every {@code PROGRESS_BATCH_SIZE} processed items.
     * Percentages are included only when stats have been set.
     */
    private void dumpProgress(int count) {
        if (count % PROGRESS_BATCH_SIZE == 0) {
            String progress = "";
            if (stats != null) {
                double processedPercent = percent(count, stats.getTotalCount());
                double indexedPercent = percent(extractionCount.get(), stats.getIndexedCount());
                progress = String.format("(%1.2f%%) (Extraction stats %d/%d %1.2f%%, Ignored count %d)",
                        processedPercent, extractionCount.get(), stats.getIndexedCount(),
                        indexedPercent, notSupportedCount.get());
            }
            log.info("Processed {} {} binaries so far ...", count, progress);
        }
    }

    /** Returns part/total as a percentage, or 0 when total is not positive. */
    private static double percent(double part, double total) {
        return total > 0 ? part / total * 100 : 0;
    }

    /** Creates the queue, the Tika helper and the worker pool on first use. */
    private synchronized void initialize() throws IOException {
        if (initialized) {
            return;
        }
        inputQueue = new ArrayBlockingQueue<WorkItem>(queueSize);
        tika = new TikaHelper(tikaConfig);
        initializeExecutorService();
        initialized = true;
    }

    /**
     * Processes a single binary. Binaries with a null or unsupported mime
     * type and blobs the writer reports as processed are only counted.
     * Otherwise the parse outcome is recorded as error, text or empty.
     */
    private void extractText(BinaryResource source) throws IOException {
        String type = source.getMimeType();
        if (type == null || !tika.isSupportedMediaType(type)) {
            log.trace("Ignoring binary content for node {} due to unsupported " +
                    "(or null) jcr:mimeType [{}]", source, type);
            notSupportedCount.incrementAndGet();
            return;
        }

        String blobId = source.getBlobId();
        if (textWriter.isProcessed(blobId)) {
            alreadyExtractedCount.incrementAndGet();
            return;
        }

        //TODO Handle case where same blob is being concurrently processed
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, type);
        if (source.getEncoding() != null) { // not mandatory
            metadata.set(Metadata.CONTENT_ENCODING, source.getEncoding());
        }

        String extractedContent = parseStringValue(source.getByteSource(), metadata, source.getPath());
        if (ERROR_TEXT.equals(extractedContent)) {
            textWriter.markError(blobId);
        } else if (extractedContent != null) {
            extractedContent = extractedContent.trim();
            if (!extractedContent.isEmpty()) {
                nonEmptyExtractedTextSize.addAndGet(extractedContent.length());
                textWriter.write(blobId, extractedContent);
                textWrittenCount.incrementAndGet();
            } else {
                textWriter.markEmpty(blobId);
                emptyCount.incrementAndGet();
            }
        }
    }

    /** Starts the fixed pool and submits one long running worker per thread. */
    private void initializeExecutorService() {
        executorService = Executors.newFixedThreadPool(threadPoolSize);
        for (int i = 0; i < threadPoolSize; i++) {
            executorService.submit(new Extractor());
        }
        log.info("Initialized text extractor pool with {} threads", threadPoolSize);
    }

    /** Worker loop: takes items from the queue until the shutdown marker shows up. */
    private class Extractor implements Runnable {
        @Override
        public void run() {
            while (true) {
                WorkItem workItem = null;
                try {
                    workItem = inputQueue.take();
                    if (workItem == SHUTDOWN_SIGNAL) {
                        inputQueue.put(SHUTDOWN_SIGNAL); //put back for other workers
                        return;
                    }
                    extractText(workItem.source);
                    dumpProgress(processedCount.incrementAndGet());
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return;
                } catch (Exception e) {
                    // A failing item must not terminate the worker
                    errorCount.incrementAndGet();
                    log.warn("Error occurred while processing {}", workItem, e);
                }
            }
        }
    }

    //~--------------------------------------< Tika >

    /**
     * Parses the given bytes with Tika.
     *
     * @return the extracted text, {@code ERROR_TEXT} when parsing failed for a
     *         reason other than reaching the write limit, or {@code null} when
     *         no bytes were read from the stream
     */
    private String parseStringValue(ByteSource byteSource, Metadata metadata, String path) {
        WriteOutContentHandler handler = new WriteOutContentHandler(maxExtractedLength);
        long start = System.currentTimeMillis();
        long size = 0;
        try {
            CountingInputStream stream = new CountingInputStream(new LazyInputStream(byteSource));
            try {
                tika.getParser().parse(stream, handler, metadata, new ParseContext());
            } finally {
                size = stream.getCount();
                stream.close();
            }
        } catch (LinkageError e) {
            // Capture and ignore errors caused by extraction libraries
            // not being present. This is equivalent to disabling
            // selected media types in configuration, so we can simply
            // ignore these errors.
        } catch (Throwable t) {
            // Capture and report any other full text extraction problems.
            // The special STOP exception is used for normal termination.
            if (!handler.isWriteLimitReached(t)) {
                parserErrorCount.incrementAndGet();
                parserError.debug("Failed to extract text from a binary property: "
                        + path
                        + " This is a fairly common case, and nothing to"
                        + " worry about. The stack trace is included to"
                        + " help improve the text extraction feature.", t);
                return ERROR_TEXT;
            }
        }
        String result = handler.toString();
        timeTaken.addAndGet(System.currentTimeMillis() - start);
        if (size > 0) {
            extractedTextSize.addAndGet(result.length());
            extractionCount.incrementAndGet();
            totalSizeRead.addAndGet(size);
            return result;
        }

        return null;
    }

    //~--------------------------------------< WorkItem >

    /** Queue entry wrapping one binary; the shutdown marker wraps {@code null}. */
    private static class WorkItem {
        final BinaryResource source;

        private WorkItem(BinaryResource source) {
            this.source = source;
        }

        @Override
        public String toString() {
            return source != null ? source.toString() : "<EMPTY>";
        }
    }

}
 Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java?rev=1690249&view=auto ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java (added) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java Fri Jul 10 11:46:03 2015 @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */

package org.apache.jackrabbit.oak.plugins.tika;

import java.io.Closeable;
import java.io.File;
import java.util.List;

import com.google.common.io.Closer;
import joptsimple.OptionParser;
import joptsimple.OptionSet;
import joptsimple.OptionSpec;
import org.apache.jackrabbit.core.data.FileDataStore;
import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreBlobStore;
import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreTextWriter;
import org.apache.jackrabbit.oak.spi.blob.BlobStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static java.util.Arrays.asList;

/**
 * Command line front end of the text extraction tooling. It parses the
 * options, builds the binary source and the blob store from them, and then
 * runs the "report" and/or "extract" action named as a non-option argument.
 */
public class TextExtractorMain {
    private static final Logger log = LoggerFactory.getLogger(TextExtractorMain.class);

    /**
     * Runs the command.
     *
     * <p>"report" and "extract" both log a summary of the binaries; "extract"
     * additionally performs the extraction into the directory given via
     * {@code store-path}. The JVM exits with status 0 after printing help and
     * with status 1 when no action is given.
     *
     * @param args options followed by the action names
     * @throws Exception any failure, rethrown after the registered resources
     *         have been closed
     */
    public static void main(String[] args) throws Exception {
        Closer closer = Closer.create();
        String h = "tika [extract|report|generate]\n" +
                "\n" +
                "report : Generates a summary report related to binary data\n" +
                "extract : Performs the text extraction\n" +
                "generate : Generates the csv data file based on configured NodeStore/BlobStore";
        try {
            OptionParser parser = new OptionParser();
            OptionSpec<?> help = parser.acceptsAll(asList("h", "?", "help"),
                    "show help").forHelp();

            OptionSpec<String> nodeStoreSpec = parser
                    .accepts("nodestore", "NodeStore detail /path/to/oak/repository | mongodb://host:port/database")
                    .withRequiredArg()
                    .ofType(String.class);

            OptionSpec<String> pathSpec = parser
                    .accepts("path", "Path in repository under which the binaries would be searched")
                    .withRequiredArg()
                    .ofType(String.class);

            OptionSpec<File> dataFileSpec = parser
                    .accepts("data-file", "Data file in csv format containing the binary metadata")
                    .withRequiredArg()
                    .ofType(File.class);

            OptionSpec<File> tikaConfigSpec = parser
                    .accepts("tika-config", "Tika config file path")
                    .withRequiredArg()
                    .ofType(File.class);

            OptionSpec<File> fdsDirSpec = parser
                    .accepts("fds-path", "Path of directory used by FileDataStore")
                    .withRequiredArg()
                    .ofType(File.class);

            OptionSpec<File> storeDirSpec = parser
                    .accepts("store-path", "Path of directory used to store extracted text content")
                    .withRequiredArg()
                    .ofType(File.class);

            OptionSpec<Integer> poolSize = parser
                    .accepts("pool-size", "Size of the thread pool used to perform text extraction. Defaults " +
                            "to number of cores on the system")
                    .withRequiredArg()
                    .ofType(Integer.class);

            //TODO implement generate support

            OptionSpec<String> nonOption = parser.nonOptions(h);

            OptionSet options = parser.parse(args);
            List<String> nonOptions = nonOption.values(options);

            if (options.has(help)) {
                parser.printHelpOn(System.out);
                System.exit(0);
            }

            if (nonOptions.isEmpty()) {
                parser.printHelpOn(System.err);
                System.exit(1);
            }

            boolean report = nonOptions.contains("report");
            boolean extract = nonOptions.contains("extract");
            File dataFile;
            File fdsDir;
            File storeDir = null;
            File tikaConfigFile = null;
            BlobStore blobStore = null;
            BinaryResourceProvider binaryResourceProvider = null;
            BinaryStats stats = null;
            String path = "/";

            if (options.has(tikaConfigSpec)) {
                tikaConfigFile = tikaConfigSpec.value(options);
                checkArgument(tikaConfigFile.exists(), "Tika config file %s does not exist",
                        tikaConfigFile.getAbsolutePath());
            }

            if (options.has(storeDirSpec)) {
                storeDir = storeDirSpec.value(options);
                // A missing directory is accepted; only an existing non-directory is rejected
                if (storeDir.exists()) {
                    checkArgument(storeDir.isDirectory(), "Path [%s] specified for storing extracted " +
                            "text content '%s' is not a directory", storeDir.getAbsolutePath(), storeDirSpec.options());
                }
            }

            if (options.has(fdsDirSpec)) {
                fdsDir = fdsDirSpec.value(options);
                checkArgument(fdsDir.exists(), "FileDataStore %s does not exist", fdsDir.getAbsolutePath());
                FileDataStore fds = new FileDataStore();
                fds.setPath(fdsDir.getAbsolutePath());
                fds.init(null);
                blobStore = new DataStoreBlobStore(fds);
            }

            if (options.has(dataFileSpec)) {
                dataFile = dataFileSpec.value(options);
                checkArgument(dataFile.exists(), "Data file %s does not exist", dataFile.getAbsolutePath());
                binaryResourceProvider = new CSVFileBinaryResourceProvider(dataFile, blobStore);
            }

            if (binaryResourceProvider instanceof Closeable) {
                closer.register((Closeable) binaryResourceProvider);
            }

            if (report || extract) {
                // The template previously lacked the closing quote after the second option
                checkNotNull(binaryResourceProvider, "BinaryProvider source must be specified either " +
                        "via '%s' or '%s'", dataFileSpec.options(), nodeStoreSpec.options());

                stats = new BinaryStats(tikaConfigFile, binaryResourceProvider);
                String summary = stats.getSummary();
                log.info(summary);
            }

            if (extract) {
                checkNotNull(storeDir, "Directory to store extracted text content " +
                        "must be specified via %s", storeDirSpec.options());
                checkNotNull(blobStore, "BlobStore found to be null. FileDataStore directory " +
                        "must be specified via %s", fdsDirSpec.options());

                DataStoreTextWriter writer = new DataStoreTextWriter(storeDir, false);
                TextExtractor extractor = new TextExtractor(writer);

                if (options.has(poolSize)) {
                    extractor.setThreadPoolSize(poolSize.value(options));
                }

                if (tikaConfigFile != null) {
                    extractor.setTikaConfig(tikaConfigFile);
                }

                if (options.has(pathSpec)) {
                    path = pathSpec.value(options);
                }

                // Registered after the writer so that the closer, which closes in
                // reverse registration order, stops the extractor first
                closer.register(writer);
                closer.register(extractor);

                extractor.setStats(stats);
                log.info("Using path {}", path);
                extractor.extract(binaryResourceProvider.getBinaries(path));

                extractor.close();
                writer.close();
            }

        } catch (Throwable e) {
            throw closer.rethrow(e);
        } finally {
            closer.close();
        }
    }
}
 Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java?rev=1690249&view=auto ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java (added) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java Fri Jul 10 11:46:03 2015 @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */

package org.apache.jackrabbit.oak.plugins.tika;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;

import javax.annotation.Nullable;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;

/**
 * Holds an {@link AutoDetectParser} built from a Tika configuration together
 * with the set of media types that parser reports as supported.
 */
class TikaHelper {
    private static final String DEFAULT_TIKA_CONFIG = "/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml";
    private static final Logger log = LoggerFactory.getLogger(TikaHelper.class);

    private final AutoDetectParser parser;
    private final Set<MediaType> supportedMediaTypes;
    private static AtomicBoolean supportedTypesLogged = new AtomicBoolean();

    /**
     * @param tikaConfig external Tika config file, or {@code null} to use the
     *                   config found on the classpath (falling back to Tika's
     *                   own default when none is found)
     * @throws IOException if reading the config fails
     * @throws RuntimeException wrapping a {@link TikaException} or
     *         {@link SAXException} raised while loading the config
     */
    public TikaHelper(@Nullable File tikaConfig) throws IOException {
        try {
            parser = new AutoDetectParser(getTikaConfig(tikaConfig));
            supportedMediaTypes = parser.getSupportedTypes(new ParseContext());
            logSupportedTypesOnce(supportedMediaTypes);
        } catch (TikaException tikaFailure) {
            throw new RuntimeException(tikaFailure);
        } catch (SAXException xmlFailure) {
            throw new RuntimeException(xmlFailure);
        }
    }

    public Parser getParser() {
        return parser;
    }

    /** Tells whether the configured parser lists the given mime type as supported. */
    public boolean isSupportedMediaType(String type) {
        MediaType parsed = MediaType.parse(type);
        return supportedMediaTypes.contains(parsed);
    }

    /**
     * This method should only be used for information purpose and not be relied
     * upon to determine if the given type is indexed or not. It relies on Tika
     * implementation detail to determine if a given type is meant to be indexed
     *
     * @param type mimeType to check
     * @return true if the given type is supported and indexed
     */
    public boolean isIndexed(String type) {
        if (!isSupportedMediaType(type)) {
            return false;
        }

        Parser candidate = getSupportingParser(parser, MediaType.parse(type));
        // A type mapped to the EmptyParser is supported but yields no text
        return candidate != null && !(unwrap(candidate) instanceof EmptyParser);
    }

    /** Picks the config source: external file, classpath resource, then Tika default. */
    private static TikaConfig getTikaConfig(File tikaConfig) throws TikaException, IOException, SAXException {
        if (tikaConfig != null) {
            log.info("Loading external Tika config from {}", tikaConfig);
            return new TikaConfig(tikaConfig);
        }

        URL bundled = TextExtractor.class.getResource(DEFAULT_TIKA_CONFIG);
        if (bundled == null) {
            log.info("Using default Tika config");
            return TikaConfig.getDefaultConfig();
        }

        log.info("Loading default Tika config from {}", bundled);
        return new TikaConfig(bundled);
    }

    /**
     * Descends through nested composite parsers to the parser registered for
     * the media type; returns {@code null} when a level has no entry for it.
     */
    private static Parser getSupportingParser(Parser p, MediaType mediaType) {
        Parser current = p;
        while (current instanceof CompositeParser) {
            Map<MediaType, Parser> byType = ((CompositeParser) current).getParsers();
            current = byType.get(mediaType);
        }
        return current;
    }

    /** Strips all decorator layers from the given parser. */
    private static Parser unwrap(Parser p) {
        Parser current = p;
        while (current instanceof ParserDecorator) {
            current = ((ParserDecorator) current).getWrappedParser();
        }
        return current;
    }

    /** Logs the supported types on the first invocation only. */
    private static void logSupportedTypesOnce(Set<MediaType> supportedMediaTypes) {
        if (supportedTypesLogged.compareAndSet(false, true)) {
            log.info("Supported media types {}", supportedMediaTypes);
        }
    }
}
 Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java?rev=1690249&r1=1690248&r2=1690249&view=diff ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java (original) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java Fri Jul 10 11:46:03 2015 @@ -101,6 +101,7 @@ import org.apache.jackrabbit.oak.plugins import org.apache.jackrabbit.oak.plugins.segment.standby.server.StandbyServer; import org.apache.jackrabbit.oak.remote.content.ContentRemoteRepository; import org.apache.jackrabbit.oak.remote.http.RemoteServlet; +import org.apache.jackrabbit.oak.plugins.tika.TextExtractorMain; import org.apache.jackrabbit.oak.scalability.ScalabilityRunner; import org.apache.jackrabbit.oak.spi.state.NodeState; import org.apache.jackrabbit.oak.spi.state.NodeStore; @@ -189,6 +190,9 @@ public final class Main { case REPAIR: repair(args); break; + case TIKA: + TextExtractorMain.main(args); + break; case HELP: default: System.err.print("Available run modes: "); @@ -1180,7 +1184,8 @@ public final class Main { HELP("help"), CHECKPOINTS("checkpoints"), RECOVERY("recovery"), - REPAIR("repair"); + REPAIR("repair"), + TIKA("tika"); private final String name; Modified: jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml?rev=1690249&r1=1690248&r2=1690249&view=diff 
============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml (original) +++ jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml Fri Jul 10 11:46:03 2015 @@ -36,6 +36,8 @@ <!-- Display info messages from the scalability suite --> <logger name="org.apache.jackrabbit.oak.scalability" level="INFO"/> + <logger name="org.apache.jackrabbit.oak.plugins.tika" level="INFO"/> + <logger name="org.apache.jackrabbit.oak.plugins.segment.file.tooling.ConsistencyChecker" level="DEBUG"/> <root level="warn"> Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java?rev=1690249&view=auto ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java (added) +++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java Fri Jul 10 11:46:03 2015 @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */

package org.apache.jackrabbit.oak.plugins.tika;

import com.google.common.base.Function;

/**
 * Functions mapping a {@link BinaryResource} to one of its string attributes.
 */
public enum BinarySourceMapper implements Function<BinaryResource, String> {
    /** Maps a resource to its blob id. */
    BY_BLOBID {
        @Override
        public String apply(BinaryResource resource) {
            return resource.getBlobId();
        }
    },

    /** Maps a resource to its path. */
    BY_PATH {
        @Override
        public String apply(BinaryResource resource) {
            return resource.getPath();
        }
    }

}
 Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java?rev=1690249&view=auto ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java (added) +++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java Fri Jul 10 11:46:03 2015 @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */

package org.apache.jackrabbit.oak.plugins.tika;

import java.io.File;
import java.util.Map;

import com.google.common.base.Charsets;
import com.google.common.io.Files;
import org.apache.commons.csv.CSVPrinter;
import org.apache.jackrabbit.oak.spi.blob.MemoryBlobStore;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import static org.junit.Assert.assertEquals;

/**
 * Checks that {@link CSVFileBinaryResourceProvider} returns the binaries
 * listed in a csv file and restricts them by path.
 */
public class CSVFileBinaryResourceProviderTest {

    @Rule
    public final TemporaryFolder temporaryFolder = new TemporaryFolder();

    @Test
    public void testGetBinaries() throws Exception {
        StringBuilder csv = new StringBuilder();
        CSVPrinter printer = new CSVPrinter(csv, CSVFileBinaryResourceProvider.FORMAT);
        // BLOB_ID, LENGTH, JCR_MIMETYPE, JCR_ENCODING, JCR_PATH
        printer.printRecord("a", 123, "text/plain", null, "/a");
        printer.printRecord("a2", 123, "text/plain", null, "/a/c");
        printer.printRecord("b", null, "text/plain", null, "/b");
        printer.printRecord(null, null, "text/plain", null, "/c");

        File csvFile = temporaryFolder.newFile();
        Files.write(csv, csvFile, Charsets.UTF_8);

        CSVFileBinaryResourceProvider provider = new CSVFileBinaryResourceProvider(csvFile, new MemoryBlobStore());

        // Four records are written; three entries are expected under the root path
        Map<String, BinaryResource> byBlobId = provider.getBinaries("/").uniqueIndex(BinarySourceMapper.BY_BLOBID);
        assertEquals(3, byBlobId.size());
        BinaryResource first = byBlobId.get("a");
        assertEquals("a", first.getBlobId());
        assertEquals("/a", first.getPath());

        byBlobId = provider.getBinaries("/a").uniqueIndex(BinarySourceMapper.BY_BLOBID);
        assertEquals(1, byBlobId.size());

        provider.close();
    }
}
 \ No newline at end of file Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java?rev=1690249&view=auto ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java (added) +++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java Fri Jul 10 11:46:03 2015 @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */

package org.apache.jackrabbit.oak.plugins.tika;

import org.apache.jackrabbit.JcrConstants;
import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
import org.apache.jackrabbit.oak.plugins.memory.MemoryNodeStore;
import org.apache.jackrabbit.oak.spi.blob.BlobStore;
import org.apache.jackrabbit.oak.spi.blob.MemoryBlobStore;
import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.jackrabbit.oak.spi.state.NodeStore;
import org.junit.Test;

import static org.apache.jackrabbit.JcrConstants.JCR_CONTENT;
import static org.apache.jackrabbit.oak.plugins.nodetype.write.InitialContent.INITIAL_CONTENT;
import static org.junit.Assert.assertEquals;

/**
 * Checks the binaries {@link NodeStoreBinaryResourceProvider} reports for a
 * small in-memory repository.
 */
public class NodeStoreBinaryResourceProviderTest {
    private NodeState root = INITIAL_CONTENT;

    @Test
    public void countBinaries() throws Exception {
        NodeBuilder rootBuilder = root.builder();
        // "a" carries a blob without content identity, "b" and "a2/c" carry one
        createFileNode(rootBuilder, "a", new IdBlob("hello", null), "text/plain");
        createFileNode(rootBuilder, "b", new IdBlob("hello", "id1"), "text/plain");

        NodeBuilder nestedContent = createFileNode(rootBuilder.child("a2"), "c", new IdBlob("hello", "id2"), "text/foo");
        nestedContent.setProperty(JcrConstants.JCR_ENCODING, "bar");

        NodeStore nodeStore = new MemoryNodeStore(rootBuilder.getNodeState());
        BlobStore blobStore = new MemoryBlobStore();
        NodeStoreBinaryResourceProvider provider = new NodeStoreBinaryResourceProvider(nodeStore, blobStore);

        assertEquals(2, provider.getBinaries("/").size());
        assertEquals(1, provider.getBinaries("/a2").size());

        BinaryResource nested = provider.getBinaries("/a2").first().get();
        assertEquals("text/foo", nested.getMimeType());
        assertEquals("bar", nested.getEncoding());
        assertEquals("id2", nested.getBlobId());

    }

    /**
     * Adds {@code name/jcr:content} below the given builder, holding the blob
     * and its mime type, and returns the builder of the content node.
     */
    private NodeBuilder createFileNode(NodeBuilder base, String name, Blob content, String mimeType) {
        NodeBuilder contentNode = base.child(name).child(JCR_CONTENT);
        contentNode.setProperty(JcrConstants.JCR_DATA, content);
        contentNode.setProperty(JcrConstants.JCR_MIMETYPE, mimeType);
        return contentNode;
    }

    /** In-memory blob that reports the id passed in as its content identity. */
    private static class IdBlob extends ArrayBasedBlob {
        final String id;

        public IdBlob(String value, String id) {
            super(value.getBytes());
            this.id = id;
        }

        @Override
        public String getContentIdentity() {
            return id;
        }
    }
}
 Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java?rev=1690249&view=auto ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java (added) +++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java Fri Jul 10 11:46:03 2015 @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.jackrabbit.oak.plugins.tika; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import com.google.common.collect.Maps; +import com.google.common.io.ByteSource; +import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter; +import org.junit.Test; + +import static java.util.Arrays.asList; +import static org.junit.Assert.assertEquals; + +public class TextExtractorTest { + + @Test + public void basicWorking() throws Exception { + MapTextWriter writer = new MapTextWriter(); + TextExtractor extractor = new TextExtractor(writer); + + List<BinaryResource> binaries = asList( + bin("hello", "text/plain", "a"), + bin("foo", "text/plain", "b") + ); + + extractor.extract(binaries); + + extractor.close(); + assertEquals(2, writer.data.size()); + assertEquals("foo", writer.data.get("b").trim()); + } + + private static BinaryResource bin(String text, String mime, String id) { + return new BinaryResource(ByteSource.wrap(text.getBytes()), mime, null, id, id); + } + + private static class MapTextWriter implements TextWriter { + final Map<String, String> data = Maps.newConcurrentMap(); + + @Override + public void write(String blobId, String text) throws IOException { + data.put(blobId, text); + } + + @Override + public void markEmpty(String blobId) { + + } + + @Override + public void markError(String blobId) { + + } + + @Override + public boolean isProcessed(String blobId) { + return data.containsKey(blobId); + } + } +} Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java ------------------------------------------------------------------------------ svn:eol-style = native Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java?rev=1690249&view=auto ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java (added) +++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java Fri Jul 10 11:46:03 2015 @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.jackrabbit.oak.plugins.tika; + +import java.io.File; + +import com.google.common.base.Charsets; +import com.google.common.io.Files; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class TikaHelperTest { + @Rule + public final TemporaryFolder temporaryFolder = new TemporaryFolder(); + + @Test + public void supportedTypes() throws Exception { + TikaHelper tika = new TikaHelper(null); + assertTrue(tika.isSupportedMediaType("text/plain")); + } + + @Test + public void indexedTypes() throws Exception { + File config = temporaryFolder.newFile(); + String configText = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + + "<properties>\n" + + " <detectors>\n" + + " <detector class=\"org.apache.tika.detect.DefaultDetector\"/>\n" + + " </detectors>\n" + + " <parsers>\n" + + " <parser class=\"org.apache.tika.parser.DefaultParser\"/>\n" + + " <parser class=\"org.apache.tika.parser.EmptyParser\">\n" + + " <mime>application/xml</mime>\n" + + " </parser>\n" + + " </parsers>\n" + + "</properties>"; + Files.write(configText, config, Charsets.UTF_8); + TikaHelper tika = new TikaHelper(config); + assertFalse(tika.isIndexed("application/xml")); + } + +} \ No newline at end of file Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java ------------------------------------------------------------------------------ svn:eol-style = native