This is an automated email from the ASF dual-hosted git repository. mkataria pushed a commit to branch OAK-10790 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit 35f7d3fb8f453c498a4594abcec8f6127b28021f Author: Mohit Kataria <[email protected]> AuthorDate: Wed Jul 30 11:51:46 2025 +0530 OAK-10790: FullTextBinaryTextExtractor fails to extract text from csv --- oak-it-osgi/pom.xml | 1 + .../jackrabbit/oak/osgi/TikaExtractionOsgiIT.java | 23 ++++++++++++++++++---- .../org/apache/jackrabbit/oak/osgi/test2.txt | 4 ++++ oak-it-osgi/src/test/resources/versions.properties | 1 + oak-parent/pom.xml | 5 +++++ oak-run-elastic/pom.xml | 3 ++- 6 files changed, 32 insertions(+), 5 deletions(-) diff --git a/oak-it-osgi/pom.xml b/oak-it-osgi/pom.xml index 02f296285e..0bdf1dfb3f 100644 --- a/oak-it-osgi/pom.xml +++ b/oak-it-osgi/pom.xml @@ -82,6 +82,7 @@ <configuration> <excludes> <exclude>**/test.txt</exclude> + <exclude>**/test2.txt</exclude> <exclude>**/test.rtf</exclude> <exclude>**/test.doc</exclude> <exclude>**/test.docx</exclude> diff --git a/oak-it-osgi/src/test/java/org/apache/jackrabbit/oak/osgi/TikaExtractionOsgiIT.java b/oak-it-osgi/src/test/java/org/apache/jackrabbit/oak/osgi/TikaExtractionOsgiIT.java index bf4234d750..faedf80ca1 100644 --- a/oak-it-osgi/src/test/java/org/apache/jackrabbit/oak/osgi/TikaExtractionOsgiIT.java +++ b/oak-it-osgi/src/test/java/org/apache/jackrabbit/oak/osgi/TikaExtractionOsgiIT.java @@ -64,11 +64,17 @@ public class TikaExtractionOsgiIT { private static final String COMPRESS_VERSION = "commons-compress"; private static final String LANG3_VERSION = "commons-lang3"; private static final String MATH3_VERSION = "commons-math3"; + private static final String COMMONS_CSV_VERSION = "commons-csv"; private static final String[] VERSION_KEYS = new String[]{TIKA_VERSION, POI_VERSION , COLLECTIONS4_VERSION, COMPRESS_VERSION - , LANG3_VERSION, MATH3_VERSION}; + , LANG3_VERSION, MATH3_VERSION, COMMONS_CSV_VERSION}; private static final String EXPECTED_TEXT_FRAGMENT = "A sample document"; + private static final String EXPECTED_CSV_FRAGMENT = + "a,b\n" + + "a,b\n" + + "a,b\n" + + "a,b"; @Configuration public 
Option[] configuration() throws IOException { @@ -111,7 +117,8 @@ public class TikaExtractionOsgiIT { composite( mavenBundle("org.apache.tika", "tika-core", versions.get(TIKA_VERSION)) , mavenBundle("org.apache.tika", "tika-parsers", versions.get(TIKA_VERSION)) - + // for csv parsing + , mavenBundle("org.apache.commons", "commons-csv", versions.get(COMMONS_CSV_VERSION)) // poi dependency start , wrappedBundle(mavenBundle("org.apache.poi", "poi", versions.get(POI_VERSION))) , wrappedBundle(mavenBundle("org.apache.poi", "poi-scratchpad", versions.get(POI_VERSION))) @@ -199,7 +206,16 @@ public class TikaExtractionOsgiIT { assertFileContains("test.txt"); } + @Test + public void csv() throws Exception { + assertFileContains("test2.txt", EXPECTED_CSV_FRAGMENT); + } + private void assertFileContains(String resName) throws Exception { + assertFileContains(resName, EXPECTED_TEXT_FRAGMENT); + } + + private void assertFileContains(String resName, String parsedContent) throws Exception { AutoDetectParser parser = new AutoDetectParser(registeredParser); ContentHandler handler = new WriteOutContentHandler(); Metadata metadata = new Metadata(); @@ -210,10 +226,9 @@ public class TikaExtractionOsgiIT { parser.parse(stream, handler, metadata); String actual = handler.toString().trim(); - assertEquals(EXPECTED_TEXT_FRAGMENT, actual); + assertEquals(parsedContent, actual); } finally { stream.close(); } - } } diff --git a/oak-it-osgi/src/test/resources/org/apache/jackrabbit/oak/osgi/test2.txt b/oak-it-osgi/src/test/resources/org/apache/jackrabbit/oak/osgi/test2.txt new file mode 100644 index 0000000000..eeb6314169 --- /dev/null +++ b/oak-it-osgi/src/test/resources/org/apache/jackrabbit/oak/osgi/test2.txt @@ -0,0 +1,4 @@ +a,b +a,b +a,b +a,b \ No newline at end of file diff --git a/oak-it-osgi/src/test/resources/versions.properties b/oak-it-osgi/src/test/resources/versions.properties index fd35fabac1..0c4d539a52 100644 --- a/oak-it-osgi/src/test/resources/versions.properties +++ 
b/oak-it-osgi/src/test/resources/versions.properties @@ -20,3 +20,4 @@ commons-collections4=4.4 commons-compress=1.27.1 commons-lang3=3.13.0 commons-math3=3.6.1 +commons-csv=1.14.0 diff --git a/oak-parent/pom.xml b/oak-parent/pom.xml index 09003b596e..8eee64bf3c 100644 --- a/oak-parent/pom.xml +++ b/oak-parent/pom.xml @@ -601,6 +601,11 @@ <artifactId>commons-text</artifactId> <version>1.14.0</version> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-csv</artifactId> + <version>1.14.0</version> + </dependency> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> diff --git a/oak-run-elastic/pom.xml b/oak-run-elastic/pom.xml index f35931caf9..11568422a4 100644 --- a/oak-run-elastic/pom.xml +++ b/oak-run-elastic/pom.xml @@ -41,8 +41,9 @@ 103.5 MB: Azure Identity client library for Java (OAK-10604) 105 MB: Azure updates 107 MB: RDB/Tomcat (OAK-10752) + 120 MB: added commons-csv for csv parsing (OAK-10790) --> - <max.jar.size>119000000</max.jar.size> + <max.jar.size>120000000</max.jar.size> </properties>
