Repository: cxf Updated Branches: refs/heads/master e0a449ec7 -> 785c0bd70
[CXF-5549] Moving Lucene DocumentMetadata to its own class and supporting injecting it via the constructor as discussed with Andriy, updating TikaContentExtractor to accept multile parsers Project: http://git-wip-us.apache.org/repos/asf/cxf/repo Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/8f99f309 Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/8f99f309 Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/8f99f309 Branch: refs/heads/master Commit: 8f99f30970db784a6d741fe76dbca517585d31cd Parents: e0a449e Author: Sergey Beryozkin <[email protected]> Authored: Fri Jun 27 16:47:49 2014 +0100 Committer: Sergey Beryozkin <[email protected]> Committed: Fri Jun 27 16:47:49 2014 +0100 ---------------------------------------------------------------------- .../ext/search/tika/TikaContentExtractor.java | 62 ++++++-- .../search/tika/TikaLuceneContentExtractor.java | 148 ++++++++++--------- .../tika/TikaLuceneContentExtractorTest.java | 4 +- 3 files changed, 128 insertions(+), 86 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cxf/blob/8f99f309/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java ---------------------------------------------------------------------- diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java index e7cb623..266457e 100644 --- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java +++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java @@ -20,6 +20,8 @@ package org.apache.cxf.jaxrs.ext.search.tika; import java.io.IOException; import java.io.InputStream; +import java.util.Collections; +import java.util.List; import java.util.logging.Level; import 
java.util.logging.Logger; @@ -28,6 +30,7 @@ import org.xml.sax.SAXException; import org.apache.cxf.common.logging.LogUtils; import org.apache.cxf.jaxrs.ext.search.SearchBean; import org.apache.tika.detect.DefaultDetector; +import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -38,9 +41,8 @@ import org.apache.tika.sax.ToTextContentHandler; public class TikaContentExtractor { private static final Logger LOG = LogUtils.getL7dLogger(TikaContentExtractor.class); - private final Parser parser; - private final DefaultDetector detector; - private final boolean validateMediaType; + private final List<Parser> parsers; + private final Detector detector; /** * Create new Tika-based content extractor using the provided parser instance. @@ -51,17 +53,32 @@ public class TikaContentExtractor { } /** + * Create new Tika-based content extractor using the provided parser instances. + * @param parsers parser instances + */ + public TikaContentExtractor(final List<Parser> parsers) { + this(parsers, new DefaultDetector()); + } + + /** + * Create new Tika-based content extractor using the provided parser instances. + * @param parsers parser instances + */ + public TikaContentExtractor(final List<Parser> parsers, Detector detector) { + this.parsers = parsers; + this.detector = detector; + } + + /** * Create new Tika-based content extractor using the provided parser instance and - * optional media type validation. If validation is enabled, the implementation + * optional media type validation. If validation is enabled, the implementation * will try to detect the media type of the input and validate it against media types * supported by the parser. 
* @param parser parser instance - * @param validateMediaType enabled or disable media type validation + * @param validateMediaType enabled or disable media type validation */ public TikaContentExtractor(final Parser parser, final boolean validateMediaType) { - this.parser = parser; - this.validateMediaType = validateMediaType; - this.detector = validateMediaType ? new DefaultDetector() : null; + this(Collections.singletonList(parser), validateMediaType ? new DefaultDetector() : null); } /** @@ -111,18 +128,28 @@ public class TikaContentExtractor { final Metadata metadata = new Metadata(); final ParseContext context = new ParseContext(); - // Try to validate that input stream media type is supported by the parser - if (validateMediaType) { - final MediaType mediaType = detector.detect(in, metadata); - if (mediaType == null || !parser.getSupportedTypes(context).contains(mediaType)) { - return null; + // Try to validate that input stream media type is supported by the parser + MediaType mediaType = null; + Parser parser = null; + for (Parser p : parsers) { + if (detector != null) { + mediaType = detector.detect(in, metadata); + if (mediaType != null && p.getSupportedTypes(context).contains(mediaType)) { + parser = p; + break; + } + } else { + parser = p; } } + if (parser == null) { + return null; + } final ToTextContentHandler handler = extractContent ? 
new ToTextContentHandler() : new IgnoreContentHandler(); parser.parse(in, handler, metadata, context); - return new TikaContent(handler.toString(), metadata); + return new TikaContent(handler.toString(), metadata, mediaType); } catch (final IOException ex) { LOG.log(Level.WARNING, "Unable to extract media type from input stream", ex); } catch (final SAXException ex) { @@ -136,9 +163,11 @@ public class TikaContentExtractor { public static class TikaContent { private String content; private Metadata metadata; - public TikaContent(String content, Metadata metadata) { + private MediaType mediaType; + public TikaContent(String content, Metadata metadata, MediaType mediaType) { this.content = content; this.metadata = metadata; + this.mediaType = mediaType; } public String getContent() { return content; @@ -146,6 +175,9 @@ public class TikaContentExtractor { public Metadata getMetadata() { return metadata; } + public MediaType getMediaType() { + return mediaType; + } } private static class IgnoreContentHandler extends ToTextContentHandler { http://git-wip-us.apache.org/repos/asf/cxf/blob/8f99f309/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java ---------------------------------------------------------------------- diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java index 8911df8..28eaa35 100644 --- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java +++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java @@ -20,8 +20,6 @@ package org.apache.cxf.jaxrs.ext.search.tika; import java.io.InputStream; import java.util.Date; -import java.util.LinkedHashMap; -import java.util.Map; import 
org.apache.cxf.jaxrs.ext.search.tika.TikaContentExtractor.TikaContent; import org.apache.lucene.document.Document; @@ -37,55 +35,9 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; public class TikaLuceneContentExtractor { - private final DocumentMetadata defaultDocumentMetadata; + private final LuceneDocumentMetadata defaultDocumentMetadata; private final TikaContentExtractor extractor; - public static class DocumentMetadata { - private final Map< String, Class< ? > > fieldTypes = - new LinkedHashMap< String, Class< ? > >(); - private final String contentFieldName; - - public DocumentMetadata(final String contentFieldName) { - this.contentFieldName = contentFieldName; - } - - public DocumentMetadata withField(final String name, final Class< ? > type) { - fieldTypes.put(name, type); - return this; - } - - public String getContentFieldName() { - return contentFieldName; - } - - private Field contentField(final String content) { - return new TextField(contentFieldName, content, Store.YES); - } - - private Field field(final String name, final String value) { - final Class< ? > type = fieldTypes.get(name); - - if (type != null) { - if (Number.class.isAssignableFrom(type)) { - if (Double.class.isAssignableFrom(type)) { - return new DoubleField(name, Double.valueOf(value), Store.YES); - } else if (Float.class.isAssignableFrom(type)) { - return new FloatField(name, Float.valueOf(value), Store.YES); - } else if (Long.class.isAssignableFrom(type)) { - return new LongField(name, Long.valueOf(value), Store.YES); - } else if (Integer.class.isAssignableFrom(type)) { - return new IntField(name, Integer.valueOf(value), Store.YES); - } - } else if (Date.class.isAssignableFrom(type)) { - return new StringField(name, value, Store.YES); - } - } - - return new StringField(name, value, Store.YES); - } - } - - /** * Create new Tika-based content extractor using the provided parser instance. 
* @param parser parser instance @@ -97,13 +49,13 @@ public class TikaLuceneContentExtractor { /** * Create new Tika-based content extractor using the provided parser instance and * optional media type validation. If validation is enabled, the implementation - * will try to detect the media type of the input and validate it against media types + * will try to detect the media type of the input and validate it against media types * supported by the parser. * @param parser parser instance * @param validateMediaType enabled or disable media type validation */ public TikaLuceneContentExtractor(final Parser parser, final boolean validateMediaType) { - this(parser, validateMediaType, "contents"); + this(parser, validateMediaType, new LuceneDocumentMetadata()); } /** @@ -111,14 +63,28 @@ public class TikaLuceneContentExtractor { * optional media type validation. If validation is enabled, the implementation * will try to detect the media type of the input and validate it against media types * supported by the parser. - * @param parser parser instance + * @param parser parser instance + * @param documentMetadata documentMetadata + */ + public TikaLuceneContentExtractor(final Parser parser, + final LuceneDocumentMetadata documentMetadata) { + this(parser, false, documentMetadata); + } + + /** + * Create new Tika-based content extractor using the provided parser instance and + * optional media type validation. If validation is enabled, the implementation + * will try to detect the media type of the input and validate it against media types + * supported by the parser. 
+ * @param parser parser instance * @param validateMediaType enabled or disable media type validation - * @param contentFieldName name of the content field, default is "contents" + * @param documentMetadata documentMetadata */ - public TikaLuceneContentExtractor(final Parser parser, final boolean validateMediaType, - final String contentFieldName) { - extractor = new TikaContentExtractor(parser, validateMediaType); - defaultDocumentMetadata = new DocumentMetadata(contentFieldName); + public TikaLuceneContentExtractor(final Parser parser, + final boolean validateMediaType, + final LuceneDocumentMetadata documentMetadata) { + this.extractor = new TikaContentExtractor(parser, validateMediaType); + this.defaultDocumentMetadata = documentMetadata; } /** @@ -129,20 +95,19 @@ public class TikaLuceneContentExtractor { * @return the extracted document or null if extraction is not possible or was unsuccessful */ public Document extract(final InputStream in) { - return extractAll(in, defaultDocumentMetadata, true, true); + return extractAll(in, null, true, true); } /** - * Extract the content and metadata from the input stream using DocumentMetadata descriptor to - * create a document with strongly typed fields. Depending on media type validation, + * Extract the content and metadata from the input stream. Depending on media type validation, * the detector could be run against input stream in order to ensure that parser supports this * type of content. 
* @param in input stream to extract the content and metadata from - * @param metadata document descriptor with field names and their types + * @param documentMetadata documentMetadata * @return the extracted document or null if extraction is not possible or was unsuccessful */ - public Document extract(final InputStream in, final DocumentMetadata metadata) { - return extractAll(in, metadata, true, true); + public Document extract(final InputStream in, final LuceneDocumentMetadata documentMetadata) { + return extractAll(in, documentMetadata, true, true); } /** @@ -153,7 +118,7 @@ public class TikaLuceneContentExtractor { * @return the extracted document or null if extraction is not possible or was unsuccessful */ public Document extractContent(final InputStream in) { - return extractAll(in, defaultDocumentMetadata, true, false); + return extractAll(in, null, true, false); } /** @@ -164,11 +129,25 @@ public class TikaLuceneContentExtractor { * @return the extracted document or null if extraction is not possible or was unsuccessful */ public Document extractMetadata(final InputStream in) { - return extractAll(in, defaultDocumentMetadata, false, true); + return extractAll(in, null, false, true); } - private Document extractAll(final InputStream in, final DocumentMetadata documentMetadata, - boolean extractContent, boolean extractMetadata) { + /** + * Extract the metadata only from the input stream. Depending on media type validation, + * the detector could be run against input stream in order to ensure that parser supports this + * type of content. 
+ * @param in input stream to extract the metadata from + * @param documentMetadata documentMetadata + * @return the extracted document or null if extraction is not possible or was unsuccessful + */ + public Document extractMetadata(final InputStream in, final LuceneDocumentMetadata documentMetadata) { + return extractAll(in, documentMetadata, false, true); + } + + private Document extractAll(final InputStream in, + LuceneDocumentMetadata documentMetadata, + boolean extractContent, + boolean extractMetadata) { TikaContent content = extractor.extractAll(in, extractContent); @@ -176,18 +155,49 @@ public class TikaLuceneContentExtractor { return null; } final Document document = new Document(); + + if (documentMetadata == null) { + documentMetadata = defaultDocumentMetadata; + } if (content.getContent() != null) { - document.add(documentMetadata.contentField(content.getContent())); + document.add(getContentField(documentMetadata, content.getContent())); } if (extractMetadata) { Metadata metadata = content.getMetadata(); for (final String property: metadata.names()) { - document.add(documentMetadata.field(property, metadata.get(property))); + document.add(getField(documentMetadata, property, metadata.get(property))); } } return document; } + + private static Field getContentField(final LuceneDocumentMetadata documentMetadata, final String content) { + return new TextField(documentMetadata.getContentFieldName(), content, Store.YES); + } + + private static Field getField(final LuceneDocumentMetadata documentMetadata, + final String name, final String value) { + final Class< ? 
> type = documentMetadata.getFieldType(name); + + if (type != null) { + if (Number.class.isAssignableFrom(type)) { + if (Double.class.isAssignableFrom(type)) { + return new DoubleField(name, Double.valueOf(value), Store.YES); + } else if (Float.class.isAssignableFrom(type)) { + return new FloatField(name, Float.valueOf(value), Store.YES); + } else if (Long.class.isAssignableFrom(type)) { + return new LongField(name, Long.valueOf(value), Store.YES); + } else if (Integer.class.isAssignableFrom(type)) { + return new IntField(name, Integer.valueOf(value), Store.YES); + } + } else if (Date.class.isAssignableFrom(type)) { + return new StringField(name, value, Store.YES); + } + } + + return new StringField(name, value, Store.YES); + } } http://git-wip-us.apache.org/repos/asf/cxf/blob/8f99f309/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java ---------------------------------------------------------------------- diff --git a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java index 3ebe02d..ef36439 100644 --- a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java +++ b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java @@ -25,7 +25,6 @@ import org.apache.cxf.jaxrs.ext.search.SearchBean; import org.apache.cxf.jaxrs.ext.search.SearchConditionParser; import org.apache.cxf.jaxrs.ext.search.fiql.FiqlParser; import org.apache.cxf.jaxrs.ext.search.lucene.LuceneQueryVisitor; -import org.apache.cxf.jaxrs.ext.search.tika.TikaLuceneContentExtractor.DocumentMetadata; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; @@ -39,6 +38,7 @@ import 
org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; import org.apache.tika.parser.pdf.PDFParser; + import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -80,7 +80,7 @@ public class TikaLuceneContentExtractorTest extends Assert { @Test public void testExtractedTextContentMatchesTypesAndSearchCriteria() throws Exception { - final DocumentMetadata documentMetadata = new DocumentMetadata("contents") + final LuceneDocumentMetadata documentMetadata = new LuceneDocumentMetadata("contents") .withField("modified", Date.class); final Document document = extractor.extract(
