Author: dmeikle
Date: Wed Nov 19 12:44:41 2014
New Revision: 1640535

URL: http://svn.apache.org/r1640535
Log:
TIKA-1477: Added new custom header to Tika resource to override Tesseract OCR 
language

Modified:
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java

Modified: 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java?rev=1640535&r1=1640534&r2=1640535&view=diff
==============================================================================
--- 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java 
(original)
+++ 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java 
Wed Nov 19 12:44:41 2014
@@ -17,35 +17,6 @@
 
 package org.apache.tika.server;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-
-import javax.mail.internet.ContentDisposition;
-import javax.mail.internet.ParseException;
-import javax.ws.rs.Consumes;
-import javax.ws.rs.GET;
-import javax.ws.rs.PUT;
-import javax.ws.rs.Path;
-import javax.ws.rs.Produces;
-import javax.ws.rs.WebApplicationException;
-import javax.ws.rs.core.Context;
-import javax.ws.rs.core.HttpHeaders;
-import javax.ws.rs.core.MultivaluedMap;
-import javax.ws.rs.core.Response;
-import javax.ws.rs.core.StreamingOutput;
-import javax.ws.rs.core.UriInfo;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.TransformerConfigurationException;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
@@ -63,14 +34,44 @@ import org.apache.tika.parser.AutoDetect
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ExpandedTitleContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
+import javax.mail.internet.ContentDisposition;
+import javax.mail.internet.ParseException;
+import javax.ws.rs.Consumes;
+import javax.ws.rs.GET;
+import javax.ws.rs.PUT;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.WebApplicationException;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.MultivaluedMap;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.StreamingOutput;
+import javax.ws.rs.core.UriInfo;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
 @Path("/tika")
 public class TikaResource {
   public static final String GREETING = "This is Tika Server. Please PUT\n";
+  public static final String X_TIKA_OCR_LANGUAGE_HEADER = "X-Tika-OCRLanguage";
   private final Log logger = LogFactory.getLog(TikaResource.class);
   
   private TikaConfig tikaConfig;
@@ -132,6 +133,18 @@ public class TikaResource {
     return httpHeaders.getFirst("File-Name");
   }
 
+  public static void fillParseContext(ParseContext parseContext, 
MultivaluedMap<String, String> httpHeaders) {
+    String language = httpHeaders.getFirst(X_TIKA_OCR_LANGUAGE_HEADER);
+    if (language != null) {
+      if (!language.matches("([A-Za-z](\\+?))*")) {
+        throw new WebApplicationException(String.format("Invalid %s format", 
X_TIKA_OCR_LANGUAGE_HEADER));
+      }
+      TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
+      ocrConfig.setLanguage(language);
+      parseContext.set(TesseractOCRConfig.class, ocrConfig);
+    }
+  }
+
   @SuppressWarnings("serial")
 public static void fillMetadata(AutoDetectParser parser, Metadata metadata, 
MultivaluedMap<String, String> httpHeaders) {
     String fileName = detectFilename(httpHeaders);
@@ -186,8 +199,10 @@ public static void fillMetadata(AutoDete
   public StreamingOutput produceText(final InputStream is, 
MultivaluedMap<String, String> httpHeaders, final UriInfo info) {     
     final AutoDetectParser parser = createParser(tikaConfig);
     final Metadata metadata = new Metadata();
+    final ParseContext context = new ParseContext();
 
     fillMetadata(parser, metadata, httpHeaders);
+    fillParseContext(context, httpHeaders);
 
     logRequest(logger, info, metadata);
 
@@ -200,7 +215,7 @@ public static void fillMetadata(AutoDete
         TikaInputStream tis = TikaInputStream.get(is);
 
         try {
-            parser.parse(tis, body, metadata);
+            parser.parse(tis, body, metadata, context);
         } catch (SAXException e) {
           throw new WebApplicationException(e);
         } catch (EncryptedDocumentException e) {
@@ -272,8 +287,11 @@ public static void fillMetadata(AutoDete
         final UriInfo info, final String format) {
     final AutoDetectParser parser = createParser(tikaConfig);
     final Metadata metadata = new Metadata();
+    final ParseContext context = new ParseContext();
 
     fillMetadata(parser, metadata, httpHeaders);
+    fillParseContext(context, httpHeaders);
+
 
     logRequest(logger, info, metadata);
 
@@ -299,7 +317,7 @@ public static void fillMetadata(AutoDete
         TikaInputStream tis = TikaInputStream.get(is);
 
         try {
-          parser.parse(tis, content, metadata);
+          parser.parse(tis, content, metadata, context);
         }
         catch (SAXException e) {
           throw new WebApplicationException(e);


Reply via email to