Author: dmeikle
Date: Thu Nov 20 10:32:30 2014
New Revision: 1640714

URL: http://svn.apache.org/r1640714
Log:
TIKA-1477: Updated Tika resource to dynamically set TesseractOCRConfig and 
PDFParserConfig files from custom headers

Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java?rev=1640714&r1=1640713&r2=1640714&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
 Thu Nov 20 10:32:30 2014
@@ -135,6 +135,9 @@ public class TesseractOCRConfig implemen
         * Multiple languages may be specified, separated by plus characters.
         */
        public void setLanguage(String language) {
+               if (!language.matches("([A-Za-z](\\+?))*")) {
+                       throw new IllegalArgumentException("Invalid language 
code");
+               }
                this.language = language;
        }
        
@@ -148,6 +151,9 @@ public class TesseractOCRConfig implemen
         * Default is 1 = Automatic page segmentation with OSD (Orientation and 
Script Detection)
         */
        public void setPageSegMode(String pageSegMode) {
+               if (!pageSegMode.matches("[1-9]|10")) {
+                       throw new IllegalArgumentException("Invalid language 
code");
+               }
                this.pageSegMode = pageSegMode;
        }
        

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java?rev=1640714&r1=1640713&r2=1640714&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
 Thu Nov 20 10:32:30 2014
@@ -23,6 +23,8 @@ import java.io.File;
 import java.io.InputStream;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 public class TesseractOCRConfigTest extends TikaTest {
 
@@ -67,4 +69,22 @@ public class TesseractOCRConfigTest exte
         assertEquals("Invalid overridden timeout value", 240, 
config.getTimeout());
     }
 
+    @Test(expected=IllegalArgumentException.class)
+    public void testValidateLanguage() {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        config.setLanguage("eng");
+        config.setLanguage("eng+fra");
+        assertTrue("Couldn't set valid values", true);
+        config.setLanguage("rm -Rf *");
+    }
+
+    @Test(expected=IllegalArgumentException.class)
+    public void testValidatePageSegMode() {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        config.setPageSegMode("0");
+        config.setPageSegMode("10");
+        assertTrue("Couldn't set valid values", true);
+        config.setPageSegMode("11");
+    }
+
 }

Modified: 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java?rev=1640714&r1=1640713&r2=1640714&view=diff
==============================================================================
--- 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java 
(original)
+++ 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java 
Thu Nov 20 10:32:30 2014
@@ -17,29 +17,6 @@
 
 package org.apache.tika.server;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.cxf.jaxrs.ext.multipart.Attachment;
-import org.apache.poi.extractor.ExtractorFactory;
-import org.apache.poi.hwpf.OldWordFileFormatException;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.html.HtmlParser;
-import org.apache.tika.parser.ocr.TesseractOCRConfig;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.ExpandedTitleContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
 import javax.mail.internet.ContentDisposition;
 import javax.mail.internet.ParseException;
 import javax.ws.rs.Consumes;
@@ -64,14 +41,42 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
+import java.lang.reflect.Field;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.hwpf.OldWordFileFormatException;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ExpandedTitleContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 @Path("/tika")
 public class TikaResource {
   public static final String GREETING = "This is Tika Server. Please PUT\n";
-  public static final String X_TIKA_OCR_LANGUAGE_HEADER = "X-Tika-OCRLanguage";
+  public static final String X_TIKA_OCR_HEADER_PREFIX = "X-Tika-OCR";
+  public static final String X_TIKA_PDF_HEADER_PREFIX = "X-Tika-PDF";
+
+
   private final Log logger = LogFactory.getLog(TikaResource.class);
   
   private TikaConfig tikaConfig;
@@ -134,14 +139,44 @@ public class TikaResource {
   }
 
   public static void fillParseContext(ParseContext parseContext, 
MultivaluedMap<String, String> httpHeaders) {
-    String language = httpHeaders.getFirst(X_TIKA_OCR_LANGUAGE_HEADER);
-    if (language != null) {
-      if (!language.matches("([A-Za-z](\\+?))*")) {
-        throw new WebApplicationException(String.format("Invalid %s format", 
X_TIKA_OCR_LANGUAGE_HEADER));
+    TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
+    PDFParserConfig pdfParserConfig = new PDFParserConfig();
+    for (String key : httpHeaders.keySet()) {
+      if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) {
+          processHeaderConfig(httpHeaders, ocrConfig, key, 
X_TIKA_OCR_HEADER_PREFIX);
+      } else if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) {
+        processHeaderConfig(httpHeaders, pdfParserConfig, key, 
X_TIKA_PDF_HEADER_PREFIX);
+      }
+    }
+    parseContext.set(TesseractOCRConfig.class, ocrConfig);
+    parseContext.set(PDFParserConfig.class, pdfParserConfig);
+  }
+
+  /**
+   * Utility method to set a property on a class via reflection.
+   *
+   * @param httpHeaders the HTTP headers set.
+   * @param object the <code>Object</code> to set the property on.
+   * @param key the key of the HTTP Header.
+   * @param prefix the name of the HTTP Header prefix used to find property.
+   * @throws WebApplicationException thrown when field cannot be found.
+   */
+  private static void processHeaderConfig(MultivaluedMap<String, String> 
httpHeaders, Object object, String key, String prefix) {
+    try {
+      String property = StringUtils.removeStart(key, prefix);
+      Field field = 
object.getClass().getDeclaredField(StringUtils.uncapitalize(property));
+      field.setAccessible(true);
+      if (field.getType() == String.class) {
+        field.set(object, httpHeaders.getFirst(key));
+      } else if (field.getType() == int.class) {
+        field.setInt(object, Integer.parseInt(httpHeaders.getFirst(key)));
+      } else if (field.getType() == double.class) {
+        field.setDouble(object, Double.parseDouble(httpHeaders.getFirst(key)));
+      } else if (field.getType() == boolean.class) {
+        field.setBoolean(object, 
Boolean.parseBoolean(httpHeaders.getFirst(key)));
       }
-      TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
-      ocrConfig.setLanguage(language);
-      parseContext.set(TesseractOCRConfig.class, ocrConfig);
+    } catch (Throwable ex) {
+      throw new WebApplicationException(String.format("%s is an invalid %s 
header", key, X_TIKA_OCR_HEADER_PREFIX));
     }
   }
 


Reply via email to