Author: nick Date: Thu Aug 20 10:01:13 2015 New Revision: 1696748 URL: http://svn.apache.org/r1696748 Log: TIKA-1710 patch from Yaniv Kunda - Use Commons IO instead of the Tika Core IO copies, and java.nio.charset.StandardCharsets
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1696748&r1=1696747&r2=1696748&view=diff ============================================================================== --- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original) +++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Thu Aug 20 10:01:13 2015 @@ -52,6 +52,9 @@ import java.util.Map.Entry; import java.util.Set; import java.util.TreeSet; +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.log4j.Level; @@ -72,9 +75,6 @@ import org.apache.tika.exception.TikaExc import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.fork.ForkParser; import org.apache.tika.gui.TikaGUI; -import org.apache.tika.io.CloseShieldInputStream; -import org.apache.tika.io.FilenameUtils; -import org.apache.tika.io.IOUtils; import org.apache.tika.io.TikaInputStream; import org.apache.tika.language.LanguageProfilerBuilder; import org.apache.tika.language.ProfilingHandler; @@ -106,6 +106,8 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Simple command line interface for Apache Tika. */ @@ -855,7 +857,7 @@ public class TikaCLI { for (File mf : dir.listFiles()) { if (mf.isFile()) { BufferedReader r = new BufferedReader(new InputStreamReader( - new FileInputStream(mf), IOUtils.UTF_8)); + new FileInputStream(mf), UTF_8)); String line; while ((line = r.readLine()) != null) { if (line.startsWith("!:mime") || @@ -969,7 +971,7 @@ public class TikaCLI { } else if (System.getProperty("os.name") .toLowerCase(Locale.ROOT).startsWith("mac os x")) { // TIKA-324: Override the default encoding on Mac OS X - return new OutputStreamWriter(output, IOUtils.UTF_8); + return new OutputStreamWriter(output, UTF_8); } else { return new OutputStreamWriter(output, Charset.defaultCharset()); } Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=1696748&r1=1696747&r2=1696748&view=diff ============================================================================== --- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java (original) +++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Thu Aug 20 10:01:13 2015 @@ -47,10 +47,6 @@ import java.awt.event.ActionEvent; import java.awt.event.ActionListener; import java.awt.event.KeyEvent; import java.awt.event.WindowEvent; -import java.awt.event.ActionEvent; -import java.awt.event.ActionListener; -import java.awt.event.KeyEvent; -import java.awt.event.WindowEvent; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; @@ -65,10 +61,10 @@ import java.util.HashMap; import java.util.Map; import java.util.Set; +import org.apache.commons.io.IOUtils; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.DocumentSelector; -import org.apache.tika.io.IOUtils; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.serialization.JsonMetadataList; @@ -91,6 +87,8 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Simple Swing GUI for Apache Tika. You can drag and drop files on top * of the window to have them parsed. @@ -481,11 +479,8 @@ public class TikaGUI extends JFrame URL url = e.getURL(); InputStream stream = url.openStream(); try { - StringWriter writer = new StringWriter(); - IOUtils.copy(stream, writer, IOUtils.UTF_8.name()); - JEditorPane editor = - new JEditorPane("text/plain", writer.toString()); + new JEditorPane("text/plain", IOUtils.toString(stream, UTF_8)); editor.setEditable(false); editor.setBackground(Color.WHITE); editor.setCaretPosition(0); Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java?rev=1696748&r1=1696747&r2=1696748&view=diff ============================================================================== --- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java (original) +++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java Thu Aug 20 10:01:13 2015 @@ -16,6 +16,7 @@ */ package org.apache.tika.cli; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -27,7 +28,7 @@ import java.util.LinkedHashMap; import java.util.Map; import org.apache.commons.io.FileUtils; -import org.apache.tika.io.IOUtils; +import org.apache.commons.io.IOUtils; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -47,7 +48,7 @@ public class TikaCLIBatchCommandLineTest OutputStream os = null; try { os = new FileOutputStream(testFile); - IOUtils.write("test output", os, "UTF-8"); + IOUtils.write("test output", os, UTF_8); } catch (IOException e) { throw new RuntimeException("Couldn't open testFile"); } finally { Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java?rev=1696748&r1=1696747&r2=1696748&view=diff ============================================================================== --- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java (original) +++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java Thu Aug 20 10:01:13 2015 @@ -17,6 +17,7 @@ package org.apache.tika.cli; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; @@ -31,7 +32,7 @@ import java.io.Reader; import java.util.List; import org.apache.commons.io.FileUtils; -import org.apache.tika.io.IOUtils; +import org.apache.commons.io.IOUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.serialization.JsonMetadataList; import org.apache.tika.parser.RecursiveParserWrapper; @@ -54,9 +55,9 @@ public class TikaCLIBatchIntegrationTest tempDir.delete(); tempDir.mkdir(); outBuffer = new ByteArrayOutputStream(); - PrintStream outWriter = new PrintStream(outBuffer, true, IOUtils.UTF_8.name()); + PrintStream outWriter = new PrintStream(outBuffer, true, UTF_8.name()); ByteArrayOutputStream errBuffer = new ByteArrayOutputStream(); - PrintStream errWriter = new PrintStream(errBuffer, true, IOUtils.UTF_8.name()); + PrintStream errWriter = new PrintStream(errBuffer, true, UTF_8.name()); out = System.out; err = System.err; System.setOut(outWriter); @@ -65,8 +66,8 @@ public class TikaCLIBatchIntegrationTest @After public void tearDown() throws Exception { - System.setOut(new PrintStream(out, true, IOUtils.UTF_8.name())); - System.setErr(new PrintStream(err, true, IOUtils.UTF_8.name())); + System.setOut(new PrintStream(out, true, UTF_8.name())); + System.setErr(new PrintStream(err, true, UTF_8.name())); FileUtils.deleteDirectory(tempDir); } @@ -104,7 +105,7 @@ public class TikaCLIBatchIntegrationTest }; TikaCLI.main(params); reader = new InputStreamReader( - new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8); + new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), UTF_8); List<Metadata> metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human events")); @@ -123,7 +124,7 @@ public class TikaCLIBatchIntegrationTest assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile()); assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists()); - String sysOutString = new String(outBuffer.toByteArray(), IOUtils.UTF_8); + String sysOutString = new String(outBuffer.toByteArray(), UTF_8); assertTrue(sysOutString.contains("MY_CUSTOM_LOG_CONFIG")); } @@ -139,7 +140,7 @@ public class TikaCLIBatchIntegrationTest }; TikaCLI.main(params); reader = new InputStreamReader( - new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8); + new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), UTF_8); List<Metadata> metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); assertEquals("59f626e09a8c16ab6dbc2800c685f772", metadataList.get(0).get("X-TIKA:digest:MD5")); @@ -159,7 +160,7 @@ public class TikaCLIBatchIntegrationTest }; TikaCLI.main(params); reader = new InputStreamReader( - new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8); + new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), UTF_8); List<Metadata> metadataList = JsonMetadataList.fromJson(reader); assertEquals(12, metadataList.size()); assertNotNull(metadataList.get(0).get("X-TIKA:digest:SHA512")); Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1696748&r1=1696747&r2=1696748&view=diff ============================================================================== --- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original) +++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Thu Aug 20 10:01:13 2015 @@ -16,6 +16,7 @@ */ package org.apache.tika.cli; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -26,7 +27,6 @@ import java.net.URI; import org.apache.commons.io.FileUtils; import org.apache.tika.exception.TikaException; -import org.apache.tika.io.IOUtils; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -50,7 +50,7 @@ public class TikaCLITest { outContent = new ByteArrayOutputStream(); resourcePrefix = testDataURI.toString(); stdout = System.out; - System.setOut(new PrintStream(outContent, true, IOUtils.UTF_8.name())); + System.setOut(new PrintStream(outContent, true, UTF_8.name())); } /** @@ -74,7 +74,7 @@ public class TikaCLITest { public void testListParserDetail() throws Exception{ String[] params = {"--list-parser-detail"}; TikaCLI.main(params); - assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("application/vnd.oasis.opendocument.text-web")); + assertTrue(outContent.toString(UTF_8.name()).contains("application/vnd.oasis.opendocument.text-web")); } /** @@ -99,11 +99,11 @@ public class TikaCLITest { public void testXMLOutput() throws Exception{ String[] params = {"-x", resourcePrefix + "alice.cli.test"}; TikaCLI.main(params); - assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("?xml version=\"1.0\" encoding=\"UTF-8\"?")); + assertTrue(outContent.toString(UTF_8.name()).contains("?xml version=\"1.0\" encoding=\"UTF-8\"?")); params = new String[]{"-x", "--digest=SHA256", resourcePrefix + "alice.cli.test"}; TikaCLI.main(params); - assertTrue(outContent.toString(IOUtils.UTF_8.name()) + assertTrue(outContent.toString(UTF_8.name()) .contains("<meta name=\"X-TIKA:digest:SHA256\" content=\"e90779adbac09c4ee")); } @@ -119,7 +119,7 @@ public class TikaCLITest { TikaCLI.main(params); assertTrue(outContent.toString("UTF-8").contains("html xmlns=\"http://www.w3.org/1999/xhtml")); assertTrue("Expanded <title></title> element should be present", - outContent.toString(IOUtils.UTF_8.name()).contains("<title></title>")); + outContent.toString(UTF_8.name()).contains("<title></title>")); params = new String[]{"-h", "--digest=SHA384", resourcePrefix + "alice.cli.test"}; TikaCLI.main(params); @@ -136,7 +136,7 @@ public class TikaCLITest { public void testTextOutput() throws Exception{ String[] params = {"-t", resourcePrefix + "alice.cli.test"}; TikaCLI.main(params); - assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("finished off the cake")); + assertTrue(outContent.toString(UTF_8.name()).contains("finished off the cake")); } /** @@ -147,12 +147,12 @@ public class TikaCLITest { public void testMetadataOutput() throws Exception{ String[] params = {"-m", resourcePrefix + "alice.cli.test"}; TikaCLI.main(params); - assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain")); + assertTrue(outContent.toString(UTF_8.name()).contains("text/plain")); params = new String[]{"-m", "--digest=SHA512", resourcePrefix + "alice.cli.test"}; TikaCLI.main(params); - assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain")); - assertTrue(outContent.toString(IOUtils.UTF_8.name()) + assertTrue(outContent.toString(UTF_8.name()).contains("text/plain")); + assertTrue(outContent.toString(UTF_8.name()) .contains("X-TIKA:digest:SHA512: dd459d99bc19ff78fd31fbae46e0")); } @@ -165,7 +165,7 @@ public class TikaCLITest { public void testJsonMetadataOutput() throws Exception { String[] params = {"--json", "--digest=MD2", resourcePrefix + "testJsonMultipleInts.html"}; TikaCLI.main(params); - String json = outContent.toString(IOUtils.UTF_8.name()); + String json = outContent.toString(UTF_8.name()); //TIKA-1310 assertTrue(json.contains("\"fb:admins\":\"1,2,3,4\",")); @@ -187,7 +187,7 @@ public class TikaCLITest { public void testJsonMetadataPrettyPrintOutput() throws Exception { String[] params = {"--json", "-r", resourcePrefix + "testJsonMultipleInts.html"}; TikaCLI.main(params); - String json = outContent.toString(IOUtils.UTF_8.name()); + String json = outContent.toString(UTF_8.name()); assertTrue(json.contains(" \"X-Parsed-By\": [\n" + " \"org.apache.tika.parser.DefaultParser\",\n" + @@ -210,7 +210,7 @@ public class TikaCLITest { public void testLanguageOutput() throws Exception{ String[] params = {"-l", resourcePrefix + "alice.cli.test"}; TikaCLI.main(params); - assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("en")); + assertTrue(outContent.toString(UTF_8.name()).contains("en")); } /** @@ -222,7 +222,7 @@ public class TikaCLITest { public void testDetectOutput() throws Exception{ String[] params = {"-d", resourcePrefix + "alice.cli.test"}; TikaCLI.main(params); - assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain")); + assertTrue(outContent.toString(UTF_8.name()).contains("text/plain")); } /** @@ -234,7 +234,7 @@ public class TikaCLITest { public void testListMetModels() throws Exception{ String[] params = {"--list-met-models", resourcePrefix + "alice.cli.test"}; TikaCLI.main(params); - assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain")); + assertTrue(outContent.toString(UTF_8.name()).contains("text/plain")); } /** @@ -246,7 +246,7 @@ public class TikaCLITest { public void testListSupportedTypes() throws Exception{ String[] params = {"--list-supported-types", resourcePrefix + "alice.cli.test"}; TikaCLI.main(params); - assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("supertype: application/octet-stream")); + assertTrue(outContent.toString(UTF_8.name()).contains("supertype: application/octet-stream")); } /** @@ -319,7 +319,7 @@ public class TikaCLITest { public void testMultiValuedMetadata() throws Exception { String[] params = {"-m", resourcePrefix + "testMultipleSheets.numbers"}; TikaCLI.main(params); - String content = outContent.toString(IOUtils.UTF_8.name()); + String content = outContent.toString(UTF_8.name()); assertTrue(content.contains("sheetNames: Checking")); assertTrue(content.contains("sheetNames: Secon sheet")); assertTrue(content.contains("sheetNames: Logical Sheet 3")); @@ -333,7 +333,7 @@ public class TikaCLITest { new File("subdir/foo.txt").delete(); new File("subdir").delete(); TikaCLI.main(params); - String content = outContent.toString(IOUtils.UTF_8.name()); + String content = outContent.toString(UTF_8.name()); assertTrue(content.contains("Extracting 'subdir/foo.txt'")); // clean up. TODO: These should be in target. new File("target/subdir/foo.txt").delete(); @@ -359,7 +359,7 @@ public class TikaCLITest { public void testConfig() throws Exception { String[] params = new String[]{"--config="+testDataFile.toString()+"/tika-config1.xml", resourcePrefix+"bad_xml.xml"}; TikaCLI.main(params); - String content = outContent.toString(IOUtils.UTF_8.name()); + String content = outContent.toString(UTF_8.name()); assertTrue(content.contains("apple")); assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser")); } @@ -368,7 +368,7 @@ public class TikaCLITest { public void testJsonRecursiveMetadataParserMetadataOnly() throws Exception { String[] params = new String[]{"-m", "-J", "-r", resourcePrefix+"test_recursive_embedded.docx"}; TikaCLI.main(params); - String content = outContent.toString(IOUtils.UTF_8.name()); + String content = outContent.toString(UTF_8.name()); assertTrue(content.contains("[\n" + " {\n" + " \"Application-Name\": \"Microsoft Office Word\",\n" + @@ -384,7 +384,7 @@ public class TikaCLITest { public void testJsonRecursiveMetadataParserDefault() throws Exception { String[] params = new String[]{"-J", "-r", resourcePrefix+"test_recursive_embedded.docx"}; TikaCLI.main(params); - String content = outContent.toString(IOUtils.UTF_8.name()); + String content = outContent.toString(UTF_8.name()); assertTrue(content.contains("\"X-TIKA:content\": \"\\u003chtml xmlns\\u003d\\\"http://www.w3.org/1999/xhtml")); } @@ -392,7 +392,7 @@ public class TikaCLITest { public void testJsonRecursiveMetadataParserText() throws Exception { String[] params = new String[]{"-J", "-r", "-t", resourcePrefix+"test_recursive_embedded.docx"}; TikaCLI.main(params); - String content = outContent.toString(IOUtils.UTF_8.name()); + String content = outContent.toString(UTF_8.name()); assertTrue(content.contains("\\n\\nembed_4\\n")); assertTrue(content.contains("\\n\\nembed_0")); } @@ -401,7 +401,7 @@ public class TikaCLITest { public void testDigestInJson() throws Exception { String[] params = new String[]{"-J", "-r", "-t", "--digest=MD5", resourcePrefix+"test_recursive_embedded.docx"}; TikaCLI.main(params); - String content = outContent.toString(IOUtils.UTF_8.name()); + String content = outContent.toString(UTF_8.name()); assertTrue(content.contains("\"X-TIKA:digest:MD5\": \"59f626e09a8c16ab6dbc2800c685f772\",")); assertTrue(content.contains("\"X-TIKA:digest:MD5\": \"f9627095ef86c482e61d99f0cc1cf87d\"")); }