This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new ce7f5891c TIKA-4452 -- remove FileProfiler.java from tika-eval (#2274)
ce7f5891c is described below
commit ce7f5891c77bd9bc81b9ccd1efcc32e27b65dbcd
Author: Tim Allison <[email protected]>
AuthorDate: Tue Jul 8 17:55:44 2025 -0400
TIKA-4452 -- remove FileProfiler.java from tika-eval (#2274)
---
.../org/apache/tika/eval/app/FileProfiler.java | 177 ---------------------
.../java/org/apache/tika/eval/app/TikaEvalCLI.java | 63 +-------
.../tika/eval/app/batch/FileProfilerBuilder.java | 96 -----------
3 files changed, 2 insertions(+), 334 deletions(-)
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
deleted file mode 100644
index 925452094..000000000
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.app;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.sql.Types;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.commons.io.FilenameUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.tika.Tika;
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.fs.FSProperties;
-import org.apache.tika.detect.FileCommandDetector;
-import org.apache.tika.eval.app.db.ColInfo;
-import org.apache.tika.eval.app.db.Cols;
-import org.apache.tika.eval.app.db.TableInfo;
-import org.apache.tika.eval.app.io.IDBWriter;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-
-/**
- * This class profiles actual files as opposed to extracts e.g. {@link ExtractProfiler}.
- * This does _not_ parse files, but does run file type identification and digests the
- * raw bytes.
- * <p>
- * If the 'file' command is available on the command line, this will also run the
- * FileCommandDetector.
- */
-
-public class FileProfiler extends AbstractProfiler {
-//TODO: we should allow users to select digest type/encoding and file detector(s).
-
- public static final String DETECT_EXCEPTION = "detect-exception";
- private static final boolean HAS_FILE = FileCommandDetector.checkHasFile();
- private static final Logger LOG =
LoggerFactory.getLogger(FileProfiler.class);
- private static final Tika TIKA = new Tika();
- private static final FileCommandDetector FILE_COMMAND_DETECTOR = new
FileCommandDetector();
- public static TableInfo FILE_PROFILES = HAS_FILE ?
- new TableInfo("file_profiles", new ColInfo(Cols.FILE_PATH,
Types.VARCHAR, 2048, "PRIMARY KEY"), new ColInfo(Cols.FILE_NAME, Types.VARCHAR,
2048),
- new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24), new
ColInfo(Cols.LENGTH, Types.BIGINT), new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
- new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER), new
ColInfo(Cols.FILE_MIME_ID, Types.INTEGER)) :
- new TableInfo("file_profiles", new ColInfo(Cols.FILE_PATH,
Types.VARCHAR, 2048, "PRIMARY KEY"), new ColInfo(Cols.FILE_NAME, Types.VARCHAR,
2048),
- new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24), new
ColInfo(Cols.LENGTH, Types.BIGINT), new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
- new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER));
-
-
- public static TableInfo FILE_MIME_TABLE =
- new TableInfo("file_mimes", new ColInfo(Cols.MIME_ID,
Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.MIME_STRING, Types.VARCHAR,
256),
- new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12));
- static Options OPTIONS;
-
- static {
-
- Option inputDir = new Option("inputDir", true, "optional: directory
for original binary input documents." + " If not specified, -extracts is
crawled as is.");
-
- OPTIONS = new Options()
- .addOption(inputDir)
- .addOption("bc", "optional: tika-batch config file")
- .addOption("numConsumers", true, "optional: number of consumer
threads")
- .addOption("db", true, "db file to which to write results")
- .addOption("jdbc", true, "EXPERT: full jdbc connection string.
Must specify this or -db <h2db>")
- .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or
specify via -Djdbc.driver")
- .addOption("tablePrefix", true, "EXPERT: optional prefix for
table names")
- .addOption("drop", false, "drop tables if they exist")
- .addOption("maxFilesToAdd", true, "maximum number of files to
add to the crawler")
-
- ;
-
- }
-
- private final Path inputDir;
-
- public FileProfiler(ArrayBlockingQueue<FileResource> fileQueue, Path
inputDir, IDBWriter dbWriter) {
- super(fileQueue, dbWriter);
- this.inputDir = inputDir;
- }
-
- public static void USAGE() {
- HelpFormatter helpFormatter = new HelpFormatter();
- helpFormatter.printHelp(80, "java -jar tika-eval-x.y.jar FileProfiler
-inputDir docs -db mydb [-inputDir input]", "Tool: Profile",
FileProfiler.OPTIONS,
- "Note: for the default h2 db, do not include the .mv.db at the
end of the db name.");
- }
-
- @Override
- public boolean processFileResource(FileResource fileResource) {
- String relPath = fileResource
- .getMetadata()
- .get(FSProperties.FS_REL_PATH);
- try (InputStream is = fileResource.openInputStream()) {
- try (TikaInputStream tis = TikaInputStream.get(is)) {
- Path path = tis.getPath();
- Map<Cols, String> data = new HashMap<>();
- int tikaMimeId = writer.getMimeId(detectTika(tis));
- String fileName = "";
- String extension = "";
- long length = -1;
- try {
- fileName = FilenameUtils.getName(relPath);
- } catch (IllegalArgumentException e) {
- LOG.warn("bad file name: " + relPath, e);
- }
-
- try {
- extension = FilenameUtils.getExtension(relPath);
- } catch (IllegalArgumentException e) {
- LOG.warn("bad extension: " + relPath, e);
- }
-
- try {
- length = Files.size(path);
- } catch (IOException e) {
- LOG.warn("problem getting size: " + relPath, e);
- }
-
- data.put(Cols.FILE_PATH, relPath);
- data.put(Cols.FILE_NAME, fileName);
- data.put(Cols.FILE_EXTENSION, extension);
- data.put(Cols.LENGTH, Long.toString(length));
- data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId));
- data.put(Cols.SHA256, DigestUtils.sha256Hex(tis));
- if (HAS_FILE) {
- int fileMimeId = writer.getMimeId(detectFile(tis));
- data.put(Cols.FILE_MIME_ID, Integer.toString(fileMimeId));
- }
- writer.writeRow(FILE_PROFILES, data);
- }
- } catch (IOException e) {
- //log at least!
- return false;
- }
- return true;
- }
-
- private String detectFile(TikaInputStream tis) {
- try {
- return FILE_COMMAND_DETECTOR
- .detect(tis, new Metadata())
- .toString();
- } catch (IOException e) {
- return DETECT_EXCEPTION;
- }
- }
-
- private String detectTika(TikaInputStream tis) {
- try {
- return TIKA.detect(tis);
- } catch (IOException e) {
- return DETECT_EXCEPTION;
- }
- }
-}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
index b44b0cf4a..10bb13bf7 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
@@ -34,7 +34,7 @@ import org.apache.tika.batch.fs.FSBatchProcessCLI;
import org.apache.tika.eval.app.reports.ResultsReporter;
public class TikaEvalCLI {
- static final String[] tools = {"Profile", "FileProfile", "Compare", "Report", "StartDB"};
+ static final String[] tools = {"Profile", "Compare", "Report", "StartDB"};
private static String specifyTools() {
StringBuilder sb = new StringBuilder();
@@ -74,72 +74,13 @@ public class TikaEvalCLI {
case "StartDB":
handleStartDB(subsetArgs);
break;
- case "FileProfile":
- handleProfileFiles(subsetArgs);
- break;
+
default:
System.out.println(specifyTools());
break;
}
}
- private void handleProfileFiles(String[] subsetArgs) throws Exception {
- List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
-
- boolean containsBC = false;
- String inputDir = null;
- //confirm there's a batch-config file
- for (String arg : argList) {
- if (arg.equals("-bc")) {
- containsBC = true;
- break;
- }
- }
-
- Path tmpBCConfig = null;
- try {
- tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
- if (!containsBC) {
- try (InputStream is = this
- .getClass()
- .getResourceAsStream("/tika-eval-file-profiler-config.xml")) {
- Files.copy(is, tmpBCConfig,
StandardCopyOption.REPLACE_EXISTING);
- }
- argList.add("-bc");
- argList.add(tmpBCConfig
- .toAbsolutePath()
- .toString());
- }
-
- String[] updatedArgs = argList.toArray(new String[0]);
- DefaultParser defaultCLIParser = new DefaultParser();
- try {
- CommandLine commandLine =
defaultCLIParser.parse(FileProfiler.OPTIONS, updatedArgs);
- if (commandLine.hasOption("db") &&
commandLine.hasOption("jdbc")) {
- System.out.println("Please specify either the default -db
or the full -jdbc, not both");
- FileProfiler.USAGE();
- return;
- }
- } catch (ParseException e) {
- System.out.println(e.getMessage() + "\n");
- FileProfiler.USAGE();
- return;
- }
-
- // lazy delete because main() calls System.exit()
- if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
- tmpBCConfig
- .toFile()
- .deleteOnExit();
- }
- FSBatchProcessCLI.main(updatedArgs);
- } finally {
- if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
- Files.delete(tmpBCConfig);
- }
- }
- }
-
private void handleStartDB(String[] args) throws SQLException {
List<String> argList = new ArrayList<>();
argList.add("-web");
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileProfilerBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileProfilerBuilder.java
deleted file mode 100644
index 7aeb8d7bc..000000000
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileProfilerBuilder.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.app.batch;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.app.ExtractProfiler;
-import org.apache.tika.eval.app.FileProfiler;
-import org.apache.tika.eval.app.db.TableInfo;
-import org.apache.tika.util.PropsUtil;
-
-
-public class FileProfilerBuilder extends EvalConsumerBuilder {
-
- public final static String TABLE_PREFIX_KEY = "tablePrefix";
-
- private final List<TableInfo> tableInfos;
-
- public FileProfilerBuilder() {
- List<TableInfo> tableInfos = new ArrayList();
- tableInfos.add(FileProfiler.FILE_MIME_TABLE);
- tableInfos.add(FileProfiler.FILE_PROFILES);
- this.tableInfos = Collections.unmodifiableList(tableInfos);
-
- }
-
- @Override
- public FileResourceConsumer build() throws IOException, SQLException {
-
- Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
-
- //we _could_ set this to extracts (if not null)
- //here, but the Crawler defaults to "input" if nothing is passed
- //so this won't work
- if (inputDir == null) {
- throw new RuntimeException("Must specify -inputDir");
- }
- return parameterizeProfiler(new FileProfiler(queue, inputDir,
getDBWriter(tableInfos)));
- }
-
-
- @Override
- protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
- String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
- if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
- for (TableInfo tableInfo : tableInfos) {
- tableInfo.setNamePrefix(tableNamePrefix);
- }
- }
- }
-
- @Override
- protected List<TableInfo> getRefTableInfos() {
- return Collections.EMPTY_LIST;
- }
-
- @Override
- protected List<TableInfo> getNonRefTableInfos() {
- return tableInfos;
- }
-
- @Override
- protected TableInfo getMimeTable() {
- return FileProfiler.FILE_MIME_TABLE;
- }
-
- @Override
- protected void addErrorLogTablePairs(DBConsumersManager manager) {
- Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"),
null);
- if (errorLog == null) {
- return;
- }
- manager.addErrorLogTablePair(errorLog,
ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
- }
-}