This is an automated email from the ASF dual-hosted git repository.

mkataria pushed a commit to branch OAK-10790
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git

commit 35f7d3fb8f453c498a4594abcec8f6127b28021f
Author: Mohit Kataria <[email protected]>
AuthorDate: Wed Jul 30 11:51:46 2025 +0530

    OAK-10790: FullTextBinaryTextExtractor fails to extract text from csv
---
 oak-it-osgi/pom.xml                                |  1 +
 .../jackrabbit/oak/osgi/TikaExtractionOsgiIT.java  | 23 ++++++++++++++++++----
 .../org/apache/jackrabbit/oak/osgi/test2.txt       |  4 ++++
 oak-it-osgi/src/test/resources/versions.properties |  1 +
 oak-parent/pom.xml                                 |  5 +++++
 oak-run-elastic/pom.xml                            |  3 ++-
 6 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/oak-it-osgi/pom.xml b/oak-it-osgi/pom.xml
index 02f296285e..0bdf1dfb3f 100644
--- a/oak-it-osgi/pom.xml
+++ b/oak-it-osgi/pom.xml
@@ -82,6 +82,7 @@
         <configuration>
           <excludes>
             <exclude>**/test.txt</exclude>
+            <exclude>**/test2.txt</exclude>
             <exclude>**/test.rtf</exclude>
             <exclude>**/test.doc</exclude>
             <exclude>**/test.docx</exclude>
diff --git 
a/oak-it-osgi/src/test/java/org/apache/jackrabbit/oak/osgi/TikaExtractionOsgiIT.java
 
b/oak-it-osgi/src/test/java/org/apache/jackrabbit/oak/osgi/TikaExtractionOsgiIT.java
index bf4234d750..faedf80ca1 100644
--- 
a/oak-it-osgi/src/test/java/org/apache/jackrabbit/oak/osgi/TikaExtractionOsgiIT.java
+++ 
b/oak-it-osgi/src/test/java/org/apache/jackrabbit/oak/osgi/TikaExtractionOsgiIT.java
@@ -64,11 +64,17 @@ public class TikaExtractionOsgiIT {
     private static final String COMPRESS_VERSION = "commons-compress";
     private static final String LANG3_VERSION = "commons-lang3";
     private static final String MATH3_VERSION = "commons-math3";
+    private static final String COMMONS_CSV_VERSION = "commons-csv";
     private static final String[] VERSION_KEYS = new String[]{TIKA_VERSION, 
POI_VERSION
             , COLLECTIONS4_VERSION, COMPRESS_VERSION
-            , LANG3_VERSION, MATH3_VERSION};
+            , LANG3_VERSION, MATH3_VERSION, COMMONS_CSV_VERSION};
 
     private static final String EXPECTED_TEXT_FRAGMENT = "A sample document";
+    private static final String EXPECTED_CSV_FRAGMENT =
+        "a,b\n" +
+        "a,b\n" +
+        "a,b\n" +
+        "a,b";
 
     @Configuration
     public Option[] configuration() throws IOException {
@@ -111,7 +117,8 @@ public class TikaExtractionOsgiIT {
                 composite(
                         mavenBundle("org.apache.tika", "tika-core", 
versions.get(TIKA_VERSION))
                         , mavenBundle("org.apache.tika", "tika-parsers", 
versions.get(TIKA_VERSION))
-
+                        // for csv parsing
+                        , mavenBundle("org.apache.commons", "commons-csv", 
versions.get(COMMONS_CSV_VERSION))
                         // poi dependency start
                         , wrappedBundle(mavenBundle("org.apache.poi", "poi", 
versions.get(POI_VERSION)))
                         , wrappedBundle(mavenBundle("org.apache.poi", 
"poi-scratchpad", versions.get(POI_VERSION)))
@@ -199,7 +206,16 @@ public class TikaExtractionOsgiIT {
         assertFileContains("test.txt");
     }
 
+    @Test
+    public void csv() throws Exception {
+        assertFileContains("test2.txt", EXPECTED_CSV_FRAGMENT);
+    }
+
     private void assertFileContains(String resName) throws Exception {
+            assertFileContains(resName, EXPECTED_TEXT_FRAGMENT);
+    }
+
+    private void assertFileContains(String resName, String parsedContent) 
throws Exception {
         AutoDetectParser parser = new AutoDetectParser(registeredParser);
         ContentHandler handler = new WriteOutContentHandler();
         Metadata metadata = new Metadata();
@@ -210,10 +226,9 @@ public class TikaExtractionOsgiIT {
             parser.parse(stream, handler, metadata);
 
             String actual = handler.toString().trim();
-            assertEquals(EXPECTED_TEXT_FRAGMENT, actual);
+            assertEquals(parsedContent, actual);
         } finally {
             stream.close();
         }
-
     }
 }
diff --git 
a/oak-it-osgi/src/test/resources/org/apache/jackrabbit/oak/osgi/test2.txt 
b/oak-it-osgi/src/test/resources/org/apache/jackrabbit/oak/osgi/test2.txt
new file mode 100644
index 0000000000..eeb6314169
--- /dev/null
+++ b/oak-it-osgi/src/test/resources/org/apache/jackrabbit/oak/osgi/test2.txt
@@ -0,0 +1,4 @@
+a,b
+a,b
+a,b
+a,b
\ No newline at end of file
diff --git a/oak-it-osgi/src/test/resources/versions.properties 
b/oak-it-osgi/src/test/resources/versions.properties
index fd35fabac1..0c4d539a52 100644
--- a/oak-it-osgi/src/test/resources/versions.properties
+++ b/oak-it-osgi/src/test/resources/versions.properties
@@ -20,3 +20,4 @@ commons-collections4=4.4
 commons-compress=1.27.1
 commons-lang3=3.13.0
 commons-math3=3.6.1
+commons-csv=1.14.0
diff --git a/oak-parent/pom.xml b/oak-parent/pom.xml
index 09003b596e..8eee64bf3c 100644
--- a/oak-parent/pom.xml
+++ b/oak-parent/pom.xml
@@ -601,6 +601,11 @@
         <artifactId>commons-text</artifactId>
         <version>1.14.0</version>
       </dependency>
+      <dependency>
+        <groupId>org.apache.commons</groupId>
+        <artifactId>commons-csv</artifactId>
+        <version>1.14.0</version>
+      </dependency>
       <dependency>
         <groupId>commons-io</groupId>
         <artifactId>commons-io</artifactId>
diff --git a/oak-run-elastic/pom.xml b/oak-run-elastic/pom.xml
index f35931caf9..11568422a4 100644
--- a/oak-run-elastic/pom.xml
+++ b/oak-run-elastic/pom.xml
@@ -41,8 +41,9 @@
         103.5 MB: Azure Identity client library for Java (OAK-10604)
         105 MB: Azure updates
         107 MB: RDB/Tomcat (OAK-10752)
+        120 MB: added commons-csv for csv parsing (OAK-10790)
         -->
-        <max.jar.size>119000000</max.jar.size>
+        <max.jar.size>120000000</max.jar.size>
                       
     </properties>
 

Reply via email to