This is an automated email from the ASF dual-hosted git repository.
mkataria pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new e187408743 OAK-10790: FullTextBinaryTextExtractor fails to extract
text from csv (#2414)
e187408743 is described below
commit e187408743b8605406ec2a009c63009f0d6770dc
Author: Mohit Kataria <[email protected]>
AuthorDate: Mon Aug 4 14:07:24 2025 +0530
OAK-10790: FullTextBinaryTextExtractor fails to extract text from csv
(#2414)
* OAK-10790: FullTextBinaryTextExtractor fails to extract text from csv
---
oak-it-osgi/pom.xml | 1 +
.../jackrabbit/oak/osgi/TikaExtractionOsgiIT.java | 23 ++++++++++++++++++----
.../org/apache/jackrabbit/oak/osgi/test2.txt | 4 ++++
oak-it-osgi/src/test/resources/versions.properties | 1 +
oak-parent/pom.xml | 5 +++++
oak-run-elastic/pom.xml | 3 ++-
6 files changed, 32 insertions(+), 5 deletions(-)
diff --git a/oak-it-osgi/pom.xml b/oak-it-osgi/pom.xml
index 02f296285e..0bdf1dfb3f 100644
--- a/oak-it-osgi/pom.xml
+++ b/oak-it-osgi/pom.xml
@@ -82,6 +82,7 @@
<configuration>
<excludes>
<exclude>**/test.txt</exclude>
+ <exclude>**/test2.txt</exclude>
<exclude>**/test.rtf</exclude>
<exclude>**/test.doc</exclude>
<exclude>**/test.docx</exclude>
diff --git a/oak-it-osgi/src/test/java/org/apache/jackrabbit/oak/osgi/TikaExtractionOsgiIT.java b/oak-it-osgi/src/test/java/org/apache/jackrabbit/oak/osgi/TikaExtractionOsgiIT.java
index bf4234d750..9233a6f3ba 100644
--- a/oak-it-osgi/src/test/java/org/apache/jackrabbit/oak/osgi/TikaExtractionOsgiIT.java
+++ b/oak-it-osgi/src/test/java/org/apache/jackrabbit/oak/osgi/TikaExtractionOsgiIT.java
@@ -64,11 +64,17 @@ public class TikaExtractionOsgiIT {
private static final String COMPRESS_VERSION = "commons-compress";
private static final String LANG3_VERSION = "commons-lang3";
private static final String MATH3_VERSION = "commons-math3";
+ private static final String COMMONS_CSV_VERSION = "commons-csv";
private static final String[] VERSION_KEYS = new String[]{TIKA_VERSION,
POI_VERSION
, COLLECTIONS4_VERSION, COMPRESS_VERSION
- , LANG3_VERSION, MATH3_VERSION};
+ , LANG3_VERSION, MATH3_VERSION, COMMONS_CSV_VERSION};
private static final String EXPECTED_TEXT_FRAGMENT = "A sample document";
+ private static final String EXPECTED_CSV_FRAGMENT =
+ "a,b\n" +
+ "a,b\n" +
+ "a,b\n" +
+ "a,b";
@Configuration
public Option[] configuration() throws IOException {
@@ -111,7 +117,8 @@ public class TikaExtractionOsgiIT {
composite(
mavenBundle("org.apache.tika", "tika-core",
versions.get(TIKA_VERSION))
, mavenBundle("org.apache.tika", "tika-parsers",
versions.get(TIKA_VERSION))
-
+ // for csv parsing
+ , mavenBundle("org.apache.commons", "commons-csv", versions.get(COMMONS_CSV_VERSION))
// poi dependency start
, wrappedBundle(mavenBundle("org.apache.poi", "poi",
versions.get(POI_VERSION)))
, wrappedBundle(mavenBundle("org.apache.poi",
"poi-scratchpad", versions.get(POI_VERSION)))
@@ -199,7 +206,16 @@ public class TikaExtractionOsgiIT {
assertFileContains("test.txt");
}
+ @Test
+ public void csv() throws Exception {
+ assertFileContains("test2.txt", EXPECTED_CSV_FRAGMENT);
+ }
+
private void assertFileContains(String resName) throws Exception {
+ assertFileContains(resName, EXPECTED_TEXT_FRAGMENT);
+ }
+
+ private void assertFileContains(String resName, String parsedContent) throws Exception {
AutoDetectParser parser = new AutoDetectParser(registeredParser);
ContentHandler handler = new WriteOutContentHandler();
Metadata metadata = new Metadata();
@@ -210,10 +226,9 @@ public class TikaExtractionOsgiIT {
parser.parse(stream, handler, metadata);
String actual = handler.toString().trim();
- assertEquals(EXPECTED_TEXT_FRAGMENT, actual);
+ assertEquals(parsedContent, actual);
} finally {
stream.close();
}
-
}
}
diff --git a/oak-it-osgi/src/test/resources/org/apache/jackrabbit/oak/osgi/test2.txt b/oak-it-osgi/src/test/resources/org/apache/jackrabbit/oak/osgi/test2.txt
new file mode 100644
index 0000000000..eeb6314169
--- /dev/null
+++ b/oak-it-osgi/src/test/resources/org/apache/jackrabbit/oak/osgi/test2.txt
@@ -0,0 +1,4 @@
+a,b
+a,b
+a,b
+a,b
\ No newline at end of file
diff --git a/oak-it-osgi/src/test/resources/versions.properties b/oak-it-osgi/src/test/resources/versions.properties
index fd35fabac1..0c4d539a52 100644
--- a/oak-it-osgi/src/test/resources/versions.properties
+++ b/oak-it-osgi/src/test/resources/versions.properties
@@ -20,3 +20,4 @@ commons-collections4=4.4
commons-compress=1.27.1
commons-lang3=3.13.0
commons-math3=3.6.1
+commons-csv=1.14.0
diff --git a/oak-parent/pom.xml b/oak-parent/pom.xml
index f66479e048..ddd7479659 100644
--- a/oak-parent/pom.xml
+++ b/oak-parent/pom.xml
@@ -601,6 +601,11 @@
<artifactId>commons-text</artifactId>
<version>1.14.0</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-csv</artifactId>
+ <version>1.14.0</version>
+ </dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
diff --git a/oak-run-elastic/pom.xml b/oak-run-elastic/pom.xml
index f35931caf9..11568422a4 100644
--- a/oak-run-elastic/pom.xml
+++ b/oak-run-elastic/pom.xml
@@ -41,8 +41,9 @@
103.5 MB: Azure Identity client library for Java (OAK-10604)
105 MB: Azure updates
107 MB: RDB/Tomcat (OAK-10752)
+ 120 MB: added commons-csv for csv parsing (OAK-10790)
-->
- <max.jar.size>119000000</max.jar.size>
+ <max.jar.size>120000000</max.jar.size>
</properties>