This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new ce7f5891c TIKA-4452 -- remove FileProfiler.java from tika-eval (#2274)
ce7f5891c is described below

commit ce7f5891c77bd9bc81b9ccd1efcc32e27b65dbcd
Author: Tim Allison <[email protected]>
AuthorDate: Tue Jul 8 17:55:44 2025 -0400

    TIKA-4452 -- remove FileProfiler.java from tika-eval (#2274)
---
 .../org/apache/tika/eval/app/FileProfiler.java     | 177 ---------------------
 .../java/org/apache/tika/eval/app/TikaEvalCLI.java |  63 +-------
 .../tika/eval/app/batch/FileProfilerBuilder.java   |  96 -----------
 3 files changed, 2 insertions(+), 334 deletions(-)

diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
deleted file mode 100644
index 925452094..000000000
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.app;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.sql.Types;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.commons.io.FilenameUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.tika.Tika;
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.fs.FSProperties;
-import org.apache.tika.detect.FileCommandDetector;
-import org.apache.tika.eval.app.db.ColInfo;
-import org.apache.tika.eval.app.db.Cols;
-import org.apache.tika.eval.app.db.TableInfo;
-import org.apache.tika.eval.app.io.IDBWriter;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-
-/**
- * This class profiles actual files as opposed to extracts e.g. {@link ExtractProfiler}.
- * This does _not_ parse files, but does run file type identification and digests the
- * raw bytes.
- * <p>
- * If the 'file' command is available on the command line, this will also run the
- * FileCommandDetector.
- */
-
-public class FileProfiler extends AbstractProfiler {
-//TODO: we should allow users to select digest type/encoding and file detector(s).
-
-    public static final String DETECT_EXCEPTION = "detect-exception";
-    private static final boolean HAS_FILE = FileCommandDetector.checkHasFile();
-    private static final Logger LOG = LoggerFactory.getLogger(FileProfiler.class);
-    private static final Tika TIKA = new Tika();
-    private static final FileCommandDetector FILE_COMMAND_DETECTOR = new FileCommandDetector();
-    public static TableInfo FILE_PROFILES = HAS_FILE ?
-            new TableInfo("file_profiles", new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"), new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048),
-                    new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24), new ColInfo(Cols.LENGTH, Types.BIGINT), new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
-                    new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER), new ColInfo(Cols.FILE_MIME_ID, Types.INTEGER)) :
-            new TableInfo("file_profiles", new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"), new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048),
-                    new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24), new ColInfo(Cols.LENGTH, Types.BIGINT), new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
-                    new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER));
-
-
-    public static TableInfo FILE_MIME_TABLE =
-            new TableInfo("file_mimes", new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
-                    new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12));
-    static Options OPTIONS;
-
-    static {
-
-        Option inputDir = new Option("inputDir", true, "optional: directory for original binary input documents." + " If not specified, -extracts is crawled as is.");
-
-        OPTIONS = new Options()
-                .addOption(inputDir)
-                .addOption("bc", "optional: tika-batch config file")
-                .addOption("numConsumers", true, "optional: number of consumer threads")
-                .addOption("db", true, "db file to which to write results")
-                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
-                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
-                .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
-                .addOption("drop", false, "drop tables if they exist")
-                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-
-        ;
-
-    }
-
-    private final Path inputDir;
-
-    public FileProfiler(ArrayBlockingQueue<FileResource> fileQueue, Path inputDir, IDBWriter dbWriter) {
-        super(fileQueue, dbWriter);
-        this.inputDir = inputDir;
-    }
-
-    public static void USAGE() {
-        HelpFormatter helpFormatter = new HelpFormatter();
-        helpFormatter.printHelp(80, "java -jar tika-eval-x.y.jar FileProfiler -inputDir docs -db mydb [-inputDir input]", "Tool: Profile", FileProfiler.OPTIONS,
-                "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
-    }
-
-    @Override
-    public boolean processFileResource(FileResource fileResource) {
-        String relPath = fileResource
-                .getMetadata()
-                .get(FSProperties.FS_REL_PATH);
-        try (InputStream is = fileResource.openInputStream()) {
-            try (TikaInputStream tis = TikaInputStream.get(is)) {
-                Path path = tis.getPath();
-                Map<Cols, String> data = new HashMap<>();
-                int tikaMimeId = writer.getMimeId(detectTika(tis));
-                String fileName = "";
-                String extension = "";
-                long length = -1;
-                try {
-                    fileName = FilenameUtils.getName(relPath);
-                } catch (IllegalArgumentException e) {
-                    LOG.warn("bad file name: " + relPath, e);
-                }
-
-                try {
-                    extension = FilenameUtils.getExtension(relPath);
-                } catch (IllegalArgumentException e) {
-                    LOG.warn("bad extension: " + relPath, e);
-                }
-
-                try {
-                    length = Files.size(path);
-                } catch (IOException e) {
-                    LOG.warn("problem getting size: " + relPath, e);
-                }
-
-                data.put(Cols.FILE_PATH, relPath);
-                data.put(Cols.FILE_NAME, fileName);
-                data.put(Cols.FILE_EXTENSION, extension);
-                data.put(Cols.LENGTH, Long.toString(length));
-                data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId));
-                data.put(Cols.SHA256, DigestUtils.sha256Hex(tis));
-                if (HAS_FILE) {
-                    int fileMimeId = writer.getMimeId(detectFile(tis));
-                    data.put(Cols.FILE_MIME_ID, Integer.toString(fileMimeId));
-                }
-                writer.writeRow(FILE_PROFILES, data);
-            }
-        } catch (IOException e) {
-            //log at least!
-            return false;
-        }
-        return true;
-    }
-
-    private String detectFile(TikaInputStream tis) {
-        try {
-            return FILE_COMMAND_DETECTOR
-                    .detect(tis, new Metadata())
-                    .toString();
-        } catch (IOException e) {
-            return DETECT_EXCEPTION;
-        }
-    }
-
-    private String detectTika(TikaInputStream tis) {
-        try {
-            return TIKA.detect(tis);
-        } catch (IOException e) {
-            return DETECT_EXCEPTION;
-        }
-    }
-}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
index b44b0cf4a..10bb13bf7 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
@@ -34,7 +34,7 @@ import org.apache.tika.batch.fs.FSBatchProcessCLI;
 import org.apache.tika.eval.app.reports.ResultsReporter;
 
 public class TikaEvalCLI {
-    static final String[] tools = {"Profile", "FileProfile", "Compare", "Report", "StartDB"};
+    static final String[] tools = {"Profile", "Compare", "Report", "StartDB"};
 
     private static String specifyTools() {
         StringBuilder sb = new StringBuilder();
@@ -74,72 +74,13 @@ public class TikaEvalCLI {
             case "StartDB":
                 handleStartDB(subsetArgs);
                 break;
-            case "FileProfile":
-                handleProfileFiles(subsetArgs);
-                break;
+
             default:
                 System.out.println(specifyTools());
                 break;
         }
     }
 
-    private void handleProfileFiles(String[] subsetArgs) throws Exception {
-        List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
-
-        boolean containsBC = false;
-        String inputDir = null;
-        //confirm there's a batch-config file
-        for (String arg : argList) {
-            if (arg.equals("-bc")) {
-                containsBC = true;
-                break;
-            }
-        }
-
-        Path tmpBCConfig = null;
-        try {
-            tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
-            if (!containsBC) {
-                try (InputStream is = this
-                        .getClass()
-                        .getResourceAsStream("/tika-eval-file-profiler-config.xml")) {
-                    Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
-                }
-                argList.add("-bc");
-                argList.add(tmpBCConfig
-                        .toAbsolutePath()
-                        .toString());
-            }
-
-            String[] updatedArgs = argList.toArray(new String[0]);
-            DefaultParser defaultCLIParser = new DefaultParser();
-            try {
-                CommandLine commandLine = defaultCLIParser.parse(FileProfiler.OPTIONS, updatedArgs);
-                if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
-                    System.out.println("Please specify either the default -db or the full -jdbc, not both");
-                    FileProfiler.USAGE();
-                    return;
-                }
-            } catch (ParseException e) {
-                System.out.println(e.getMessage() + "\n");
-                FileProfiler.USAGE();
-                return;
-            }
-
-            // lazy delete because main() calls System.exit()
-            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
-                tmpBCConfig
-                        .toFile()
-                        .deleteOnExit();
-            }
-            FSBatchProcessCLI.main(updatedArgs);
-        } finally {
-            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
-                Files.delete(tmpBCConfig);
-            }
-        }
-    }
-
     private void handleStartDB(String[] args) throws SQLException {
         List<String> argList = new ArrayList<>();
         argList.add("-web");
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileProfilerBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileProfilerBuilder.java
deleted file mode 100644
index 7aeb8d7bc..000000000
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileProfilerBuilder.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.app.batch;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.app.ExtractProfiler;
-import org.apache.tika.eval.app.FileProfiler;
-import org.apache.tika.eval.app.db.TableInfo;
-import org.apache.tika.util.PropsUtil;
-
-
-public class FileProfilerBuilder extends EvalConsumerBuilder {
-
-    public final static String TABLE_PREFIX_KEY = "tablePrefix";
-
-    private final List<TableInfo> tableInfos;
-
-    public FileProfilerBuilder() {
-        List<TableInfo> tableInfos = new ArrayList();
-        tableInfos.add(FileProfiler.FILE_MIME_TABLE);
-        tableInfos.add(FileProfiler.FILE_PROFILES);
-        this.tableInfos = Collections.unmodifiableList(tableInfos);
-
-    }
-
-    @Override
-    public FileResourceConsumer build() throws IOException, SQLException {
-
-        Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
-
-        //we _could_ set this to extracts (if not null)
-        //here, but the Crawler defaults to "input" if nothing is passed
-        //so this won't work
-        if (inputDir == null) {
-            throw new RuntimeException("Must specify -inputDir");
-        }
-        return parameterizeProfiler(new FileProfiler(queue, inputDir, getDBWriter(tableInfos)));
-    }
-
-
-    @Override
-    protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
-        String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
-        if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
-            for (TableInfo tableInfo : tableInfos) {
-                tableInfo.setNamePrefix(tableNamePrefix);
-            }
-        }
-    }
-
-    @Override
-    protected List<TableInfo> getRefTableInfos() {
-        return Collections.EMPTY_LIST;
-    }
-
-    @Override
-    protected List<TableInfo> getNonRefTableInfos() {
-        return tableInfos;
-    }
-
-    @Override
-    protected TableInfo getMimeTable() {
-        return FileProfiler.FILE_MIME_TABLE;
-    }
-
-    @Override
-    protected void addErrorLogTablePairs(DBConsumersManager manager) {
-        Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
-        if (errorLog == null) {
-            return;
-        }
-        manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-    }
-}

Reply via email to