This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e5036c357 TIKA-4507 -- improve tika-eval-app's commandline in 4.x (#2357)
e5036c357 is described below
commit e5036c35757d4603fedbb924050c85c89d8bf514
Author: Tim Allison <[email protected]>
AuthorDate: Mon Oct 6 10:51:41 2025 -0400
TIKA-4507 -- improve tika-eval-app's commandline in 4.x (#2357)
---
.../src/main/java/org/apache/tika/eval/app/EvalConfig.java | 8 ++++++++
.../java/org/apache/tika/eval/app/ExtractComparerRunner.java | 12 ++++++++++++
.../java/org/apache/tika/eval/app/ExtractProfileRunner.java | 12 +++++++++++-
3 files changed, 31 insertions(+), 1 deletion(-)
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
index 5525180ed..fc0d72f0a 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java
@@ -85,4 +85,12 @@ public class EvalConfig {
jdbcDriverClass + '\'' + ", forceDrop=" + forceDrop + ",
maxFilesToAdd=" + maxFilesToAdd + ", maxTokens=" + maxTokens + ",
maxContentLength=" + maxContentLength +
", numThreads=" + numWorkers + ", errorLogFile=" +
errorLogFile + '}';
}
+
+ public void setNumWorkers(int n) {
+ numWorkers = n;
+ }
+
+ public void setMaxExtractLength(long m) {
+ maxExtractLength = m;
+ }
}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
index 57f98d601..8f86ab81e 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java
@@ -77,8 +77,11 @@ public class ExtractComparerRunner {
+ " If not specified, -extracts is crawled as
is.").get())
.addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db
path").get())
.addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json
config file").get())
+
.addOption(Option.builder("n").longOpt("numWorkers").hasArg().desc("number of
worker threads").get())
+
.addOption(Option.builder("m").longOpt("maxExtractLength").hasArg().desc("maximum
extract length").get())
;
}
+
public static void main(String[] args) throws Exception {
DefaultParser defaultCLIParser = new DefaultParser();
CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args);
@@ -87,6 +90,15 @@ public class ExtractComparerRunner {
Path extractsBDir = commandLine.hasOption('b') ?
Paths.get(commandLine.getOptionValue('b')) : Paths.get(USAGE_FAIL("Must specify
extractsB dir: -b"));
Path inputDir = commandLine.hasOption('i') ?
Paths.get(commandLine.getOptionValue('i')) : extractsADir;
String dbPath = commandLine.hasOption('d') ?
commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d");
+
+ if (commandLine.hasOption('n')) {
+
evalConfig.setNumWorkers(Integer.parseInt(commandLine.getOptionValue('n')));
+ }
+
+ if (commandLine.hasOption('m')) {
+
evalConfig.setMaxExtractLength(Long.parseLong(commandLine.getOptionValue('m')));
+ }
+
String jdbcString = getJdbcConnectionString(dbPath);
execute(inputDir, extractsADir, extractsBDir, jdbcString, evalConfig);
}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java
index 221df02fa..a73a2f579 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java
@@ -76,8 +76,11 @@ public class ExtractProfileRunner {
+ " If not specified, -extracts is crawled as
is.").get())
.addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db
path").get())
.addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json
config file").get())
- ;
+
.addOption(Option.builder("n").longOpt("numWorkers").hasArg().desc("number of
worker threads").get())
+
.addOption(Option.builder("m").longOpt("maxExtractLength").hasArg().desc("maximum
extract length").get())
+ ;
}
+
public static void main(String[] args) throws Exception {
DefaultParser defaultCLIParser = new DefaultParser();
CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args);
@@ -86,6 +89,13 @@ public class ExtractProfileRunner {
Path inputDir = commandLine.hasOption('i') ?
Paths.get(commandLine.getOptionValue('i')) : extractsDir;
String dbPath = commandLine.hasOption('d') ?
commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d");
String jdbcString = getJdbcConnectionString(dbPath);
+ if (commandLine.hasOption('n')) {
+
evalConfig.setNumWorkers(Integer.parseInt(commandLine.getOptionValue('n')));
+ }
+
+ if (commandLine.hasOption('m')) {
+
evalConfig.setMaxExtractLength(Long.parseLong(commandLine.getOptionValue('m')));
+ }
execute(inputDir, extractsDir, jdbcString, evalConfig);
}