Hi Chris, Yes, I made a mistake in this commit: I missed a renamed file and broke the build. The next commit corrected it: Revision: 1633331 Author: thaichat04 Date: mardi 21 octobre 2014 11:47:54 Message: TIKA-1422 - Fixing build & minor refactory of naming test class ---- Modified : /tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Added : /tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java Deleted : /tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java
Please pull the latest revision again and let me know if it is OK. Sorry. On Tue, Oct 21, 2014 at 3:49 PM, Mattmann, Chris A (3980) < chris.a.mattm...@jpl.nasa.gov> wrote: > Hi Hong-Thai, > > These commits look strange to me - it looks like it subtracts the > whole files (and the unit test removed the test file, renamed it, > and then added what largely looks like the same file, back?) > > Any idea what's up? > > Cheers, > Chris > > ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > Chris Mattmann, Ph.D. > Chief Architect > Instrument Software and Science Data Systems Section (398) > NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA > Office: 168-519, Mailstop: 168-527 > Email: chris.a.mattm...@nasa.gov > WWW: http://sunset.usc.edu/~mattmann/ > ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > Adjunct Associate Professor, Computer Science Department > University of Southern California, Los Angeles, CA 90089 USA > ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > > > > > > > -----Original Message----- > From: "thaicha...@apache.org" <thaicha...@apache.org> > Reply-To: "dev@tika.apache.org" <dev@tika.apache.org> > Date: Tuesday, October 21, 2014 at 2:32 AM > To: "comm...@tika.apache.org" <comm...@tika.apache.org> > Subject: svn commit: r1633325 - in /tika/trunk/tika-parsers/src: > main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java > test/java/org/apache/tika/parser/mail/RFC822ParserTest.java > > >Author: thaichat04 > >Date: Tue Oct 21 09:32:06 2014 > >New Revision: 1633325 > > > >URL: http://svn.apache.org/r1633325 > >Log: > >TIKA-1422 - Apply fix of [~olegt] in Windows > > > >Modified: > > > >tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract > >OCRParser.java > > > >tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822Pa > >rserTest.java > > > >Modified: > >tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract > >OCRParser.java > >URL: > > > 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apa > >che/tika/parser/ocr/TesseractOCRParser.java?rev=1633325&r1=1633324&r2=1633 > >325&view=diff > >========================================================================== > >==== > >--- > >tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract > >OCRParser.java (original) > >+++ > >tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract > >OCRParser.java Tue Oct 21 09:32:06 2014 > >@@ -26,11 +26,11 @@ import java.io.IOException; > > import java.io.InputStream; > > import java.io.InputStreamReader; > > import java.io.Reader; > >+import java.util.ArrayList; > > import java.util.HashSet; > >+import java.util.List; > > import java.util.Map; > > import java.util.Set; > >-import java.util.List; > >-import java.util.ArrayList; > > import java.util.concurrent.Callable; > > import java.util.concurrent.ExecutionException; > > import java.util.concurrent.FutureTask; > >@@ -45,20 +45,23 @@ import org.apache.tika.io.TemporaryResou > > import org.apache.tika.io.TikaInputStream; > > import org.apache.tika.metadata.Metadata; > > import org.apache.tika.mime.MediaType; > >-import org.apache.tika.parser.Parser; > > import org.apache.tika.parser.AbstractParser; > > import org.apache.tika.parser.ParseContext; > >+import org.apache.tika.parser.Parser; > > import org.apache.tika.parser.external.ExternalParser; > >+import org.apache.tika.parser.image.ImageParser; > >+import org.apache.tika.parser.image.PSDParser; > >+import org.apache.tika.parser.image.TiffParser; > >+import org.apache.tika.parser.jpeg.JpegParser; > > import org.apache.tika.sax.XHTMLContentHandler; > > import org.xml.sax.ContentHandler; > > import org.xml.sax.SAXException; > > > > /** > >- * TesseractOCRParser powered by tesseract-ocr engine. > >- * To enable this parser, create a {@link TesseractOCRConfig} > >- * object and pass it through a ParseContext. 
> >- * Tesseract-ocr must be installed and on system path or > >- * the path to its root folder must be provided: > >+ * TesseractOCRParser powered by tesseract-ocr engine. To enable this > >parser, > >+ * create a {@link TesseractOCRConfig} object and pass it through a > >+ * ParseContext. Tesseract-ocr must be installed and on system path or > >the path > >+ * to its root folder must be provided: > > * <p> > > * TesseractOCRConfig config = new TesseractOCRConfig();<br> > > * //Needed if tesseract is not on system path<br> > >@@ -69,226 +72,231 @@ import org.xml.sax.SAXException; > > * > > */ > > public class TesseractOCRParser extends AbstractParser { > >- > >- private static final long serialVersionUID = 1L; > >- > >- private static final Set<MediaType> SUPPORTED_TYPES = getTypes(); > >- > >- private static Set<MediaType> getTypes() { > >- HashSet<MediaType> supportedTypes = new > HashSet<MediaType>(); > >- > >- supportedTypes.add(MediaType.image("png")); > >- supportedTypes.add(MediaType.image("jpeg")); > >- supportedTypes.add(MediaType.image("tiff")); > >- supportedTypes.add(MediaType.image("x-ms-bmp")); > >- supportedTypes.add(MediaType.image("gif")); > >- > >- return supportedTypes; > >- } > >- > >- @Override > >- public Set<MediaType> getSupportedTypes(ParseContext arg0) { > >- return SUPPORTED_TYPES; > >- } > >- > >- private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) { > >- if(!config.getTesseractPath().isEmpty()){ > >- Map<String, String> env = pb.environment(); > >- env.put("TESSDATA_PREFIX", config.getTesseractPath()); > >- } > >+ > >+ private static final long serialVersionUID = 1L; > >+ > >+ private static final Set<MediaType> SUPPORTED_TYPES = getTypes(); > >+ > >+ private static Set<MediaType> getTypes() { > >+ HashSet<MediaType> supportedTypes = new HashSet<MediaType>(); > >+ > >+ supportedTypes.add(MediaType.image("png")); > >+ supportedTypes.add(MediaType.image("jpeg")); > >+ supportedTypes.add(MediaType.image("tiff")); > >+ 
supportedTypes.add(MediaType.image("x-ms-bmp")); > >+ supportedTypes.add(MediaType.image("gif")); > >+ > >+ return supportedTypes; > >+ } > >+ > >+ @Override > >+ public Set<MediaType> getSupportedTypes(ParseContext arg0) { > >+ return SUPPORTED_TYPES; > >+ } > >+ > >+ private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) { > >+ if (!config.getTesseractPath().isEmpty()) { > >+ Map<String, String> env = pb.environment(); > >+ env.put("TESSDATA_PREFIX", config.getTesseractPath()); > > } > >- > >- public void parse(Image image, ContentHandler handler, Metadata > >metadata, ParseContext context) > >- throws IOException, SAXException, TikaException { > >- > >- TemporaryResources tmp = new TemporaryResources(); > >- FileOutputStream fos = null; > >- TikaInputStream tis = null; > >- try{ > >- int w = image.getWidth(null); > >- int h = image.getHeight(null); > >- BufferedImage bImage = new BufferedImage(w, h, > >BufferedImage.TYPE_INT_RGB); > >- Graphics2D g2 = bImage.createGraphics(); > >- g2.drawImage(image, 0, 0, null); > >- g2.dispose(); > >- File file = tmp.createTemporaryFile(); > >- fos = new FileOutputStream(file); > >- ImageIO.write(bImage, "png", fos); > >- bImage = null; > >- tis = TikaInputStream.get(file); > >- parse(tis, handler, metadata, context); > >- > >- }finally{ > >- tmp.dispose(); > >- if(tis != null) > >- tis.close(); > >- if(fos != null) > >- fos.close(); > >- } > >- > >- > >- } > >- > >- @Override > >- public void parse( > >- InputStream stream, ContentHandler handler, > >- Metadata metadata, ParseContext context) > >- throws IOException, SAXException, TikaException { > >- > >- TesseractOCRConfig config = context.get(TesseractOCRConfig.class); > >- if(config == null) config = new TesseractOCRConfig(); > >- > >- String[] checkCmd = {config.getTesseractPath() + "tesseract"}; > >- // If Tesseract is not on the path, do not try to run OCR. 
> >- if (!ExternalParser.check(checkCmd)) return; > >- > >- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, > >metadata); > >+ } > > > >- TemporaryResources tmp = new TemporaryResources(); > >- File output = null; > >- try { > >- TikaInputStream tikaStream = TikaInputStream.get(stream, > tmp); > >- File input = tikaStream.getFile(); > >- long size = tikaStream.getLength(); > >- > >- if(size >= config.getMinFileSizeToOcr() && size <= > >config.getMaxFileSizeToOcr()){ > >- > >- output = tmp.createTemporaryFile(); > >- doOCR(input, output, config); > >- > >- //Tesseract appends .txt to output file name > >- output = new File(output.getAbsolutePath() + ".txt"); > >- > >- if(output.exists()) > >- extractOutput(new FileInputStream(output), xhtml); > >+ public void parse(Image image, ContentHandler handler, Metadata > >metadata, ParseContext context) throws IOException, > >+ SAXException, TikaException { > > > >- } > >- > >- } finally { > >- tmp.dispose(); > >- if(output != null) > >- output.delete(); > >- > >- } > >+ TemporaryResources tmp = new TemporaryResources(); > >+ FileOutputStream fos = null; > >+ TikaInputStream tis = null; > >+ try { > >+ int w = image.getWidth(null); > >+ int h = image.getHeight(null); > >+ BufferedImage bImage = new BufferedImage(w, h, > >BufferedImage.TYPE_INT_RGB); > >+ Graphics2D g2 = bImage.createGraphics(); > >+ g2.drawImage(image, 0, 0, null); > >+ g2.dispose(); > >+ File file = tmp.createTemporaryFile(); > >+ fos = new FileOutputStream(file); > >+ ImageIO.write(bImage, "png", fos); > >+ bImage = null; > >+ tis = TikaInputStream.get(file); > >+ parse(tis, handler, metadata, context); > >+ > >+ } finally { > >+ tmp.dispose(); > >+ if (tis != null) > >+ tis.close(); > >+ if (fos != null) > >+ fos.close(); > > } > > > >- /** > >- * Run external tesseract-ocr process. 
> >- * @param input File to be ocred > >- * @param output File to collect ocr result > >- * @param config Configuration of tesseract-ocr engine > >- * @throws TikaException if the extraction timed out > >- * @throws IOException if an input error occurred > >- */ > >- private void doOCR(File input, File output, TesseractOCRConfig > >config) > >- throws IOException, TikaException { > >- String[] cmd = {config.getTesseractPath() + "tesseract", > >- input.getPath(), > >- output.getPath() , > >- "-l", > >- config.getLanguage() , > >- "-psm", > >- config.getPageSegMode() }; > >- > >- ProcessBuilder pb = new ProcessBuilder(cmd); > >- setEnv(config, pb); > >- final Process process = pb.start(); > >- > >- process.getOutputStream().close(); > >- InputStream out = process.getInputStream(); > >- InputStream err = process.getErrorStream(); > >- > >- logStream("OCR MSG", out, input); > >- logStream("OCR ERROR", err, input); > >- > >- FutureTask<Integer> waitTask = new FutureTask<Integer>(new > >Callable<Integer>() { > >- public Integer call() throws Exception { > >- return process.waitFor(); > >- } > >- }); > >- > >- Thread waitThread = new Thread(waitTask); > >- waitThread.start(); > >- > >- try { > >- waitTask.get(config.getTimeout(), TimeUnit.SECONDS); > >- > >- } catch (InterruptedException e) { > >- waitThread.interrupt(); > >- process.destroy(); > >- Thread.currentThread().interrupt(); > >- throw new TikaException("TesseractOCRParser interrupted", > e); > >- > >- } catch (ExecutionException e) { > >- //should not be thrown > >- > >- } catch (TimeoutException e) { > >- waitThread.interrupt(); > >- process.destroy(); > >- throw new TikaException("TesseractOCRParser > timeout", e); > >- } > >- > >- > >+ } > >+ > >+ @Override > >+ public void parse(InputStream stream, ContentHandler handler, Metadata > >metadata, ParseContext context) > >+ throws IOException, SAXException, TikaException { > >+ > >+ TesseractOCRConfig config = context.get(TesseractOCRConfig.class); > >+ if 
(config == null) > >+ config = new TesseractOCRConfig(); > >+ > >+ String[] checkCmd = { config.getTesseractPath() + "tesseract" }; > >+ // If Tesseract is not on the path, do not try to run OCR. > >+ if (!ExternalParser.check(checkCmd)) > >+ return; > >+ > >+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, > >metadata); > >+ > >+ TemporaryResources tmp = new TemporaryResources(); > >+ File output = null; > >+ try { > >+ TikaInputStream tikaStream = TikaInputStream.get(stream, tmp); > >+ File input = tikaStream.getFile(); > >+ long size = tikaStream.getLength(); > >+ > >+ if (size >= config.getMinFileSizeToOcr() && size <= > >config.getMaxFileSizeToOcr()) { > >+ > >+ output = tmp.createTemporaryFile(); > >+ doOCR(input, output, config); > >+ > >+ // Tesseract appends .txt to output file name > >+ output = new File(output.getAbsolutePath() + ".txt"); > >+ > >+ if (output.exists()) > >+ extractOutput(new FileInputStream(output), xhtml); > >+ > >+ } > >+ > >+ } finally { > >+ tmp.dispose(); > >+ if (output != null) > >+ output.delete(); > >+ > > } > >- > >+ } > > > >- /** > >- * Reads the contents of the given stream and write it to the > >- * given XHTML content handler. > >- * The stream is closed once fully processed. 
> >- * > >- * @param stream Stream where is the result of ocr > >- * @param xhtml XHTML content handler > >- * @throws SAXException if the XHTML SAX events could not be handled > >- * @throws IOException if an input error occurred > >- */ > >- private void extractOutput(InputStream stream, XHTMLContentHandler > >xhtml) > >- throws SAXException, IOException { > >- > >- Reader reader = new InputStreamReader(stream, "UTF-8"); > >- xhtml.startDocument(); > >- xhtml.startElement("div"); > >- try { > >- char[] buffer = new char[1024]; > >- for (int n = reader.read(buffer); n != -1; n = > >reader.read(buffer)) { > >- if (n > 0) xhtml.characters(buffer, 0, n); > >- } > >- } finally { > >- reader.close(); > >- } > >- xhtml.endElement("div"); > >- xhtml.endDocument(); > >+ /** > >+ * Run external tesseract-ocr process. > >+ * > >+ * @param input > >+ * File to be ocred > >+ * @param output > >+ * File to collect ocr result > >+ * @param config > >+ * Configuration of tesseract-ocr engine > >+ * @throws TikaException > >+ * if the extraction timed out > >+ * @throws IOException > >+ * if an input error occurred > >+ */ > >+ private void doOCR(File input, File output, TesseractOCRConfig config) > >throws IOException, TikaException { > >+ String[] cmd = { config.getTesseractPath() + "tesseract", > >input.getPath(), output.getPath(), "-l", > >+ config.getLanguage(), "-psm", config.getPageSegMode() }; > >+ > >+ ProcessBuilder pb = new ProcessBuilder(cmd); > >+ setEnv(config, pb); > >+ final Process process = pb.start(); > >+ > >+ process.getOutputStream().close(); > >+ InputStream out = process.getInputStream(); > >+ InputStream err = process.getErrorStream(); > >+ > >+ logStream("OCR MSG", out, input); > >+ logStream("OCR ERROR", err, input); > >+ > >+ FutureTask<Integer> waitTask = new FutureTask<Integer>(new > >Callable<Integer>() { > >+ public Integer call() throws Exception { > >+ return process.waitFor(); > >+ } > >+ }); > >+ > >+ Thread waitThread = new Thread(waitTask); > 
>+ waitThread.start(); > >+ > >+ try { > >+ waitTask.get(config.getTimeout(), TimeUnit.SECONDS); > >+ > >+ } catch (InterruptedException e) { > >+ waitThread.interrupt(); > >+ process.destroy(); > >+ Thread.currentThread().interrupt(); > >+ throw new TikaException("TesseractOCRParser interrupted", e); > >+ > >+ } catch (ExecutionException e) { > >+ // should not be thrown > >+ > >+ } catch (TimeoutException e) { > >+ waitThread.interrupt(); > >+ process.destroy(); > >+ throw new TikaException("TesseractOCRParser timeout", e); > > } > > > >- /** > >- * Starts a thread that reads the contents of the standard output > >- * or error stream of the given process to not block the process. > >- * The stream is closed once fully processed. > >- */ > >- private void logStream(final String logType, final InputStream > >stream, final File file) { > >- new Thread() { > >- public void run() { > >- Reader reader = new InputStreamReader(stream); > >- StringBuilder out = new StringBuilder(); > >- char[] buffer = new char[1024]; > >- try { > >- for (int n = reader.read(buffer); > n != -1; n = reader.read(buffer)) > >- out.append(buffer, 0, n); > >- } catch (IOException e) { > >- > >- } finally { > >- IOUtils.closeQuietly(stream); > >- } > >- > >- > >- String msg = out.toString(); > >- //log or discard message? > >- > >- } > >- }.start(); > >+ } > >+ > >+ /** > >+ * Reads the contents of the given stream and write it to the given > >XHTML > >+ * content handler. The stream is closed once fully processed. 
> >+ * > >+ * @param stream > >+ * Stream where is the result of ocr > >+ * @param xhtml > >+ * XHTML content handler > >+ * @throws SAXException > >+ * if the XHTML SAX events could not be handled > >+ * @throws IOException > >+ * if an input error occurred > >+ */ > >+ private void extractOutput(InputStream stream, XHTMLContentHandler > >xhtml) throws SAXException, IOException { > >+ > >+ Reader reader = new InputStreamReader(stream, "UTF-8"); > >+ xhtml.startDocument(); > >+ xhtml.startElement("div"); > >+ try { > >+ char[] buffer = new char[1024]; > >+ for (int n = reader.read(buffer); n != -1; n = > >reader.read(buffer)) { > >+ if (n > 0) > >+ xhtml.characters(buffer, 0, n); > >+ } > >+ } finally { > >+ reader.close(); > > } > >+ xhtml.endElement("div"); > >+ xhtml.endDocument(); > >+ } > >+ > >+ /** > >+ * Starts a thread that reads the contents of the standard output or > >error > >+ * stream of the given process to not block the process. The stream is > >closed > >+ * once fully processed. > >+ */ > >+ private void logStream(final String logType, final InputStream stream, > >final File file) { > >+ new Thread() { > >+ public void run() { > >+ Reader reader = new InputStreamReader(stream); > >+ StringBuilder out = new StringBuilder(); > >+ char[] buffer = new char[1024]; > >+ try { > >+ for (int n = reader.read(buffer); n != -1; n = > >reader.read(buffer)) > >+ out.append(buffer, 0, n); > >+ } catch (IOException e) { > > > >- > >-} > >+ } finally { > >+ IOUtils.closeQuietly(stream); > >+ } > > > >+ String msg = out.toString(); > >+ // log or discard message? 
> > > >+ } > >+ }.start(); > >+ } > >+ > >+ private List<Parser> getImageParsers() { > >+ List<Parser> parsers = new ArrayList<Parser>(); > >+ parsers.add(new ImageParser()); > >+ parsers.add(new PSDParser()); > >+ parsers.add(new TiffParser()); > >+ parsers.add(new JpegParser()); > >+ return parsers; > >+ } > >+ > >+} > > > >Modified: > >tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822Pa > >rserTest.java > >URL: > > > http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apa > >che/tika/parser/mail/RFC822ParserTest.java?rev=1633325&r1=1633324&r2=16333 > >25&view=diff > >========================================================================== > >==== > >--- > >tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822Pa > >rserTest.java (original) > >+++ > >tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822Pa > >rserTest.java Tue Oct 21 09:32:06 2014 > >@@ -36,6 +36,8 @@ import org.apache.tika.metadata.Metadata > > import org.apache.tika.metadata.TikaCoreProperties; > > import org.apache.tika.parser.ParseContext; > > import org.apache.tika.parser.Parser; > >+import org.apache.tika.parser.ocr.TesseractOCRConfig; > >+import org.apache.tika.parser.ocr.TesseractOCRParserTest; > > import org.apache.tika.sax.BodyContentHandler; > > import org.apache.tika.sax.XHTMLContentHandler; > > import org.junit.Test; > >@@ -83,13 +85,19 @@ public class RFC822ParserTest { > > try { > > parser.parse(stream, handler, metadata, new ParseContext()); > > verify(handler).startDocument(); > >- //4 body-part divs -- two outer bodies and two inner bodies > >- verify(handler, > >times(4)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), > >eq("div"), any(Attributes.class)); > >- verify(handler, > >times(4)).endElement(XHTMLContentHandler.XHTML, "div", "div"); > >- //5 paragraph elements, 4 for body-parts and 1 for > >encompassing message > >- verify(handler, > 
>times(5)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), > >any(Attributes.class)); > >- verify(handler, > >times(5)).endElement(XHTMLContentHandler.XHTML, "p", "p"); > >+ int bodyExpectedTimes = 4, multipackExpectedTimes = 5;; > >+ int invokingTimes = bodyExpectedTimes; > >+ TesseractOCRConfig config = new TesseractOCRConfig(); > >+ if (TesseractOCRParserTest.canRun(config)) { > >+ invokingTimes = multipackExpectedTimes; > >+ } > >+ > >+ verify(handler, > >times(invokingTimes)).startElement(eq(XHTMLContentHandler.XHTML), > >eq("div"), eq("div"), any(Attributes.class)); > >+ verify(handler, > >times(invokingTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div"); > >+ verify(handler, > >times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), > > eq("p"), eq("p"), any(Attributes.class)); > >+ verify(handler, > >times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "p", > >"p"); > > verify(handler).endDocument(); > >+ > > } catch (Exception e) { > > fail("Exception thrown: " + e.getMessage()); > > } > > > > > > -- -------------- Hong-Thai