Author: grossws Date: Mon Sep 21 17:20:36 2015 New Revision: 1704370 URL: http://svn.apache.org/viewvc?rev=1704370&view=rev Log: Add interruptable parsing example
See also http://stackoverflow.com/questions/31939851 Added: tika/trunk/tika-example/src/main/java/org/apache/tika/example/InterruptableParsingExample.java Added: tika/trunk/tika-example/src/main/java/org/apache/tika/example/InterruptableParsingExample.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/InterruptableParsingExample.java?rev=1704370&view=auto ============================================================================== --- tika/trunk/tika-example/src/main/java/org/apache/tika/example/InterruptableParsingExample.java (added) +++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/InterruptableParsingExample.java Mon Sep 21 17:20:36 2015 @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.example; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * This example demonstrates how to interrupt document parsing if + * some condition is met. + * <p> + * {@link InterruptingContentHandler} throws special exception as soon as + * find {@code query} string in parsed file. + * + * See also http://stackoverflow.com/questions/31939851 + */ +public class InterruptableParsingExample { + private Tika tika = new Tika(); // for default autodetect parser + + public boolean findInFile(String query, Path path) { + InterruptingContentHandler handler = new InterruptingContentHandler(query); + + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + context.set(Parser.class, tika.getParser()); + + try (InputStream is = new BufferedInputStream(Files.newInputStream(path))) { + tika.getParser().parse(is, handler, metadata, context); + } catch (QueryMatchedException e) { + return true; + } catch (SAXException | TikaException | IOException e) { + // something went wrong with parsing... + e.printStackTrace(); + } + return false; + } + + class QueryMatchedException extends SAXException { + } + + /** + * Trivial content handler that searched for {@code query} in characters send to it. + * <p> + * Throws {@link QueryMatchedException} when query string is found. + */ + class InterruptingContentHandler extends DefaultHandler { + private String query; + private StringBuilder sb = new StringBuilder(); + + InterruptingContentHandler(String query) { + this.query = query; + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + sb.append(new String(ch, start, length).toLowerCase()); + + if (sb.toString().contains(query)) + throw new QueryMatchedException(); + + if (sb.length() > 2 * query.length()) + sb.delete(0, sb.length() - query.length()); // keep tail with query.length() chars + } + } +}
