This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/2.x by this push:
     new 67a5e91  TIKA-2317 warn user if max content length is hit; allow for easier parameterization by commandline
new c4888d5 Merge remote-tracking branch 'origin/2.x' into 2.x
67a5e91 is described below
commit 67a5e91b2a4157ee06f924280b0b828819c88223
Author: tballison <[email protected]>
AuthorDate: Thu Apr 6 12:11:32 2017 -0400
    TIKA-2317 warn user if max content length is hit; allow for easier parameterization by commandline
---
.../org/apache/tika/eval/AbstractProfiler.java | 97 ++++++++----
.../java/org/apache/tika/eval/ExtractComparer.java | 6 +
.../java/org/apache/tika/eval/ExtractProfiler.java | 6 +-
.../org/apache/tika/eval/XMLErrorLogUpdater.java | 12 +-
.../tika/eval/batch/EvalConsumerBuilder.java | 21 +++
.../tika/eval/batch/ExtractComparerBuilder.java | 4 +-
.../tika/eval/batch/ExtractProfilerBuilder.java | 5 +-
.../main/java/org/apache/tika/eval/db/Cols.java | 3 +-
.../java/org/apache/tika/eval/db/JDBCUtil.java | 14 +-
.../java/org/apache/tika/eval/db/MimeBuffer.java | 4 +-
.../java/org/apache/tika/eval/io/DBWriter.java | 10 +-
.../org/apache/tika/eval/io/ExtractReader.java | 9 +-
.../java/org/apache/tika/eval/io/XMLLogReader.java | 12 +-
.../java/org/apache/tika/eval/reports/Report.java | 12 +-
.../apache/tika/eval/reports/ResultsReporter.java | 5 +-
.../tika/eval/tokens/AnalyzerDeserializer.java | 26 +++-
.../apache/tika/eval/tokens/AnalyzerManager.java | 9 +-
.../tika/eval/tokens/CommonTokenCountManager.java | 8 +-
.../src/main/resources/comparison-reports.xml | 169 +++++++++++++++++----
tika-eval/src/main/resources/log4j.properties | 5 +-
tika-eval/src/main/resources/lucene-analyzers.json | 7 -
tika-eval/src/main/resources/profile-reports.xml | 1 +
.../main/resources/tika-eval-comparison-config.xml | 8 +
.../main/resources/tika-eval-profiler-config.xml | 5 +
.../org/apache/tika/eval/AnalyzerManagerTest.java | 11 +-
.../org/apache/tika/eval/SimpleComparerTest.java | 46 +++++-
.../java/org/apache/tika/eval/TikaEvalCLITest.java | 20 ++-
.../apache/tika/eval/db/AbstractBufferTest.java | 2 +-
.../apache/tika/eval/tokens/TokenCounterTest.java | 2 +-
.../single-file-profiler-crawl-extract-config.xml | 2 +-
30 files changed, 402 insertions(+), 139 deletions(-)
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index f81d25e..1091537 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -108,8 +108,9 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
private static CommonTokenCountManager commonTokenCountManager;
private String lastExtractExtension = null;
- final AnalyzerManager analyzerManager;
- final TokenCounter tokenCounter;
+ AnalyzerManager analyzerManager;
+ TokenCounter tokenCounter;
+
public enum EXCEPTION_TYPE {
RUNTIME,
@@ -136,9 +137,11 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
    private static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");
-    final static int FILE_PATH_MAX_LEN = 512;//max len for varchar for file_path
-    final static int MAX_STRING_LENGTH = 1000000;
-    final static int MAX_LEN_FOR_LANG_ID = 20000;
+    final static int FILE_PATH_MAX_LEN = 1024;//max len for varchar for file_path
+ int maxContentLength = 10000000;
+ int maxContentLengthForLangId = 50000;
+ int maxTokens = 200000;
+
//these remove runtime info from the stacktraces so
//that actual causes can be counted.
@@ -168,14 +171,45 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
super(fileQueue);
this.writer = writer;
langIder = new LanguageIDWrapper();
+ initAnalyzersAndTokenCounter(maxTokens);
+ }
+
+ private void initAnalyzersAndTokenCounter(int maxTokens) {
try {
- analyzerManager = AnalyzerManager.newInstance();
+ analyzerManager = AnalyzerManager.newInstance(maxTokens);
            tokenCounter = new TokenCounter(analyzerManager.getGeneralAnalyzer());
} catch (IOException e) {
throw new RuntimeException(e);
}
+
}
+ /**
+     * Truncate the content string to this length if it exceeds it
+ * @param maxContentLength
+ */
+ public void setMaxContentLength(int maxContentLength) {
+ this.maxContentLength = maxContentLength;
+ }
+
+ /**
+     * Truncate the content string to this length for language id if it exceeds it
+ *
+ * @param maxContentLengthForLangId
+ */
+ public void setMaxContentLengthForLangId(int maxContentLengthForLangId) {
+ this.maxContentLengthForLangId = maxContentLengthForLangId;
+ }
+
+ /**
+     * Add a LimitTokenCountFilterFactory if maxTokens > -1
+ *
+ * @param maxTokens
+ */
+ public void setMaxTokens(int maxTokens) {
+ this.maxTokens = maxTokens;
+ initAnalyzersAndTokenCounter(maxTokens);
+ }
protected void writeExtractException(TableInfo extractExceptionTable,
String containerId,
@@ -211,18 +245,15 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
//if the outer wrapper document
if (i == 0) {
-
data.put(Cols.IS_EMBEDDED, FALSE);
            data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString());
} else {
data.put(Cols.IS_EMBEDDED, TRUE);
            data.put(Cols.FILE_NAME, getFileName(m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
}
-
String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
data.put(Cols.FILE_EXTENSION, ext);
-
long srcFileLen = getSourceFileLength(m);
if (srcFileLen > NON_EXISTENT_FILE_LENGTH) {
data.put(Cols.LENGTH, Long.toString(srcFileLen));
@@ -236,7 +267,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
data.put(Cols.ELAPSED_TIME_MILLIS,
getTime(m));
- String content = getContent(m, MAX_STRING_LENGTH);
+ String content = getContent(m, maxContentLength);
if (content == null || content.trim().length() == 0) {
data.put(Cols.HAS_CONTENT, FALSE);
} else {
@@ -261,13 +292,13 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
try {
return FilenameUtils.getName(path);
} catch (IllegalArgumentException e) {
- logger.warn(e.getMessage() + " in "+path);
+ logger.warn("{} in {}", e.getMessage(), path);
}
path = path.replaceAll("\u0000", " ");
try {
return FilenameUtils.getName(path);
} catch (IllegalArgumentException e) {
- logger.warn("Again: " + e.getMessage() + " in "+path);
+ logger.warn("Again: {} in {}", e.getMessage(), path);
}
//give up
return "";
@@ -301,15 +332,14 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
if (m == null) {
return;
}
-
- String content = getContent(m, MAX_STRING_LENGTH);
+ Map<Cols, String> data = new HashMap<>();
+ String content = getContent(m, maxContentLength, data);
if (content == null || content.trim().length() == 0) {
return;
}
tokenCounter.clear(fieldName);
tokenCounter.add(fieldName, content);
- Map<Cols, String> data = new HashMap<>();
data.put(Cols.ID, fileId);
data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length()));
langid(m, data);
@@ -322,7 +352,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
commonTokenResult =
commonTokenCountManager.countTokenOverlaps(langid,
tokenCounter.getTokens(fieldName));
} catch (IOException e) {
- logger.error(e.getMessage(), e);
+ logger.error("{}", e.getMessage(), e);
}
data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
data.put(Cols.NUM_COMMON_TOKENS,
Integer.toString(commonTokenResult.getCommonTokens()));
@@ -418,6 +448,24 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
}
+ /**
+     * Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether the string was truncated
+ *
+ * @param metadata
+ * @param maxLength
+ * @param data
+ * @return
+ */
+    protected static String getContent(Metadata metadata, int maxLength, Map<Cols, String> data) {
+ data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
+ String c = getContent(metadata, maxLength);
+ if (c.length() > maxLength) {
+ c = c.substring(0, maxLength);
+ data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
+ }
+ return c;
+
+ }
protected static String getContent(Metadata metadata, int maxLength) {
if (metadata == null) {
return "";
@@ -426,20 +474,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
if (c == null) {
return "";
}
- if (c.length() > maxLength) {
- c = c.substring(0, maxLength);
- }
return c;
}
void unicodeBlocks(Metadata metadata, Map<Cols, String> data) {
- String content = getContent(metadata, MAX_LEN_FOR_LANG_ID);
+ String content = getContent(metadata, maxContentLengthForLangId);
if (content.length() < 200) {
return;
}
String s = content;
- if (content.length() > MAX_LEN_FOR_LANG_ID) {
- s = content.substring(0, MAX_LEN_FOR_LANG_ID);
+ if (content.length() > maxContentLengthForLangId) {
+ s = content.substring(0, maxContentLengthForLangId);
}
Map<String, Integer> m = new HashMap<>();
Reader r = new StringReader(s);
@@ -486,13 +531,13 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
void langid(Metadata metadata, Map<Cols, String> data) {
- String content = getContent(metadata, MAX_LEN_FOR_LANG_ID);
+ String content = getContent(metadata, maxContentLengthForLangId);
if (content.length() < 50) {
return;
}
String s = content;
- if (content.length() > MAX_LEN_FOR_LANG_ID) {
- s = content.substring(0, MAX_LEN_FOR_LANG_ID);
+ if (content.length() > maxContentLengthForLangId) {
+ s = content.substring(0, maxContentLengthForLangId);
}
List<DetectedLanguage> probabilities = langIder.getProbabilities(s);
if (probabilities.size() > 0) {
@@ -577,7 +622,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
try {
srcLen = Files.size(inputFile);
} catch (IOException e) {
- logger.warn("Couldn't get length for:
"+inputFile.toAbsolutePath());
+ logger.warn("Couldn't get length for: {}",
inputFile.toAbsolutePath());
}
return new EvalFilePaths(relativeSourceFilePath, extractFile, srcLen);
}
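
For readers skimming the patch: the new getContent(Metadata, int, Map<Cols, String>) overload boils down to a truncate-and-flag helper. Below is a minimal standalone sketch of that pattern; the plain String and Map<String, String> stand in for Tika's Metadata and Cols and are assumptions for illustration, not the real API.

    import java.util.HashMap;
    import java.util.Map;

    public class TruncateAndFlagSketch {
        // Record FALSE up front, then flip to TRUE only if the string is actually cut,
        // mirroring the new getContent(metadata, maxLength, data) overload above.
        static String getContent(String content, int maxLength, Map<String, String> data) {
            data.put("CONTENT_TRUNCATED_AT_MAX_LEN", "FALSE");
            if (content == null) {
                return "";
            }
            if (content.length() > maxLength) {
                content = content.substring(0, maxLength);
                data.put("CONTENT_TRUNCATED_AT_MAX_LEN", "TRUE");
            }
            return content;
        }

        public static void main(String[] args) {
            Map<String, String> data = new HashMap<>();
            System.out.println(getContent("0123456789", 4, data)); // 0123
            System.out.println(data.get("CONTENT_TRUNCATED_AT_MAX_LEN")); // TRUE
        }
    }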
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
index a50b710..9caef9f 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
@@ -80,6 +80,10 @@ public class ExtractComparer extends AbstractProfiler {
                .addOption("tablePrefixB", true, "EXPERT: optional prefix for table names for B")
                .addOption("drop", true, "drop tables if they exist")
                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
+                .addOption("maxTokens", true, "maximum tokens to process, default=200000")
+                .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
+                .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
+
        ;
}
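
These options follow the commons-cli builder pattern used throughout tika-eval. A hedged sketch of how one such option surfaces at parse time; DefaultParser assumes commons-cli 1.3+, and the argument values here are purely illustrative:

    import org.apache.commons.cli.CommandLine;
    import org.apache.commons.cli.DefaultParser;
    import org.apache.commons.cli.Options;

    public class OptionSketch {
        public static void main(String[] args) throws Exception {
            Options options = new Options()
                    .addOption("maxTokens", true, "maximum tokens to process, default=200000");
            CommandLine cl = new DefaultParser().parse(options,
                    new String[]{"-maxTokens", "50000"});
            // Fall back to the documented default when the flag is not passed.
            System.out.println(cl.getOptionValue("maxTokens", "200000")); // 50000
        }
    }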
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index 1f9bfda..9b7ddc4 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -68,6 +68,9 @@ public class ExtractProfiler extends AbstractProfiler {
.addOption("tablePrefix", true, "EXPERT: optional prefix for
table names")
.addOption("drop", true, "drop tables if they exist")
.addOption("maxFilesToAdd", true, "maximum number of files to
add to the crawler")
+ .addOption("maxTokens", true, "maximum tokens to process,
default=200000")
+ .addOption("maxContentLength", true, "truncate content beyond
this length for calculating 'contents' stats, default=1000000")
+ .addOption("maxContentLengthForLangId", true, "truncate
content beyond this length for language id, default=50000")
;
@@ -145,7 +148,8 @@ public class ExtractProfiler extends AbstractProfiler {
new ColInfo(Cols.TOKEN_ENTROPY_RATE, Types.FLOAT),
new ColInfo(Cols.TOKEN_LENGTH_SUM, Types.INTEGER),
new ColInfo(Cols.TOKEN_LENGTH_MEAN, Types.FLOAT),
- new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, Types.FLOAT)
+ new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, Types.FLOAT),
+ new ColInfo(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, Types.BOOLEAN)
);
private final Path inputDir;
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
index 5c1b371..499b6ac 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
@@ -49,13 +49,11 @@ import org.slf4j.LoggerFactory;
* the "comparisons" table. It should not be run in a multithreaded
environment.
*/
public class XMLErrorLogUpdater {
-
-    protected static Logger LOGGER = LoggerFactory.getLogger(ResultsReporter.class);
+    private static final Logger LOG = LoggerFactory.getLogger(XMLErrorLogUpdater.class);
private Statement statement;
public static void main(String[] args) throws Exception {
-
XMLErrorLogUpdater writer = new XMLErrorLogUpdater();
Path xmlLogFileA = Paths.get(args[0]);
Path xmlLogFileB = Paths.get(args[1]);
@@ -174,9 +172,9 @@ public class XMLErrorLogUpdater {
int updated = statement.executeUpdate(sql);
if (updated == 0) {
//TODO: log
- LOGGER.warn("made no updates in xmlerrorlogupdater!");
+ LOG.warn("made no updates in xmlerrorlogupdater!");
} else if (updated > 1) {
- LOGGER.warn("made too many updates");
+ LOG.warn("made too many updates");
}
}
@@ -195,9 +193,9 @@ public class XMLErrorLogUpdater {
rs.close();
if (resultCount == 0) {
- LOGGER.warn("Should have found a container for: "+resourceId);
+ LOG.warn("Should have found a container for: {}", resourceId);
} else if (resultCount > 1) {
- LOGGER.error("Records ids should be unique:"+resourceId);
+ LOG.error("Records ids should be unique: {}", resourceId);
}
/*
if (containerId < 0) {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
index bad8f61..6e9b6c9 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
@@ -174,6 +174,27 @@ public abstract class EvalConsumerBuilder {
return new ExtractReader(alterExtractList, minExtractLength,
maxExtractLength);
}
+    FileResourceConsumer parameterizeProfiler(AbstractProfiler abstractProfiler) {
+
+        int maxContentLength = PropsUtil.getInt(localAttrs.get("maxContentLength"), -1);
+        if (maxContentLength > -1) {
+            abstractProfiler.setMaxContentLength(maxContentLength);
+        }
+
+        int maxContentLengthForLangId = PropsUtil.getInt(localAttrs.get("maxContentLengthForLangId"), -1);
+        if (maxContentLengthForLangId > -1) {
+            abstractProfiler.setMaxContentLengthForLangId(maxContentLengthForLangId);
+        }
+
+ int maxTokens = PropsUtil.getInt(localAttrs.get("maxTokens"), -1);
+ if (maxTokens > -1) {
+ abstractProfiler.setMaxTokens(maxTokens);
+ }
+
+
+ return abstractProfiler;
+ }
+
/*
public abstract Map<String, String> getIndexInfo();
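
The parameterizeProfiler hook applies each commandline attribute only when the user actually supplied it, using -1 as a "not set" sentinel so the profiler's compiled-in defaults survive otherwise. A minimal sketch of that gate; the getInt helper below is a stand-in for illustration, not PropsUtil's real signature:

    import java.util.HashMap;
    import java.util.Map;

    public class SentinelConfigSketch {
        // Stand-in parser: return the sentinel when the attribute is absent or malformed.
        static int getInt(String value, int defaultValue) {
            try {
                return Integer.parseInt(value);
            } catch (NumberFormatException | NullPointerException e) {
                return defaultValue;
            }
        }

        public static void main(String[] args) {
            Map<String, String> localAttrs = new HashMap<>();
            localAttrs.put("maxTokens", "50000"); // e.g. from the batch config/commandline

            // -1 means "not set": only call the setter when a value was supplied.
            int maxTokens = getInt(localAttrs.get("maxTokens"), -1);
            if (maxTokens > -1) {
                System.out.println("would call setMaxTokens(" + maxTokens + ")");
            }
        }
    }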
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
index b9c5ee3..3cd428a 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
@@ -90,9 +90,9 @@ public class ExtractComparerBuilder extends EvalConsumerBuilder {
throw new RuntimeException("Must specify an -inputDir");
}
-        return new ExtractComparer(queue, inputRootDir, extractsA, extractsB,
+        return parameterizeProfiler(new ExtractComparer(queue, inputRootDir, extractsA, extractsB,
                buildExtractReader(localAttrs),
-                getDBWriter(getNonRefTableInfos()));
+                getDBWriter(getNonRefTableInfos())));
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
index f89eeb0..11310ee 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
@@ -78,11 +78,12 @@ public class ExtractProfilerBuilder extends EvalConsumerBuilder {
if (extracts == null && inputDir != null) {
extracts = inputDir;
}
-        return new ExtractProfiler(queue, inputDir, extracts,
+        return parameterizeProfiler(new ExtractProfiler(queue, inputDir, extracts,
                buildExtractReader(localAttrs),
-                getDBWriter(tableInfos));
+                getDBWriter(tableInfos)));
}
+
@Override
protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
index bf8784b..91917ec 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -57,6 +57,7 @@ public enum Cols {
TOKEN_LENGTH_STD_DEV,
UNICODE_CHAR_BLOCKS,
NUM_PAGES, //number of pages a document alleges it has
+    CONTENT_TRUNCATED_AT_MAX_LEN, // was the string truncated at the configured max content length
//content comparisons
TOP_10_UNIQUE_TOKEN_DIFFS_A,
@@ -86,5 +87,5 @@ public enum Cols {
DIR_NAME_A,//for comparisons in REF_PAIR_NAMES
DIR_NAME_B
- }
+}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java b/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java
index fb69bd2..aaf8403 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java
@@ -37,10 +37,11 @@ import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.log4j.Logger;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class JDBCUtil {
+ private static final Logger LOG = LoggerFactory.getLogger(JDBCUtil.class);
public enum CREATE_TABLE {
DROP_IF_EXISTS,
@@ -48,7 +49,6 @@ public class JDBCUtil {
THROW_EX_IF_EXISTS,
}
- public static Logger logger = Logger.getLogger(JDBCUtil.class);
private final String connectionString;
private String driverClass;
@@ -160,7 +160,7 @@ public class JDBCUtil {
}
return insertStatement.executeUpdate();
} catch (SQLException e) {
- logger.warn("couldn't insert data for this row: " +
e.getMessage());
+ LOG.warn("couldn't insert data for this row: {}", e.getMessage());
e.printStackTrace();
return -1;
}
@@ -177,7 +177,7 @@ public class JDBCUtil {
case Types.VARCHAR:
                    if (value != null && value.length() > colInfo.getPrecision()) {
                        value = value.substring(0, colInfo.getPrecision());
-                        logger.warn("truncated varchar value in " + colInfo.getName() + " : " + value);
+                        LOG.warn("truncated varchar value in {} : {}", colInfo.getName(), value);
}
st.setString(dbColOffset, value);
break;
@@ -204,11 +204,11 @@ public class JDBCUtil {
}
} catch (NumberFormatException e) {
if (!"".equals(value)) {
- logger.warn("number format exception: " + colInfo.getName() +
" : " + value);
+ LOG.warn("number format exception: {} : {}",
colInfo.getName(), value);
}
st.setNull(dbColOffset, colInfo.getType());
} catch (SQLException e) {
- logger.warn("sqlexception: " + colInfo + " : " + value);
+ LOG.warn("sqlexception: {} : {}", colInfo, value);
st.setNull(dbColOffset, colInfo.getType());
}
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
index d557f9f..073dd63 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
@@ -93,12 +93,12 @@ public class MimeBuffer extends AbstractDBBuffer {
* (e.g. "pdf").
* <p>
     * This has special handling for texty filetypes whose MimeTypes
-     * don't currently return anything for {@link org.apache.tika.mime.MimeType#getExtension};
+     * don't currently return anything for {@link MimeType#getExtension};
     *
     * @param contentType string representing a content type, for example: "application/pdf"
     * @param config config from which to get MimeRepository
     * @return extension or empty string
-     * @throws org.apache.tika.mime.MimeTypeException thrown if MimeTypes can't parse the contentType
+     * @throws MimeTypeException thrown if MimeTypes can't parse the contentType
*/
    public static String getExtension(String contentType, TikaConfig config)
            throws MimeTypeException {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
index 9302712..7e0573f 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
@@ -27,12 +27,13 @@ import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.log4j.Logger;
import org.apache.tika.eval.db.ColInfo;
import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.JDBCUtil;
import org.apache.tika.eval.db.MimeBuffer;
import org.apache.tika.eval.db.TableInfo;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* This is still in its early stages. The idea is to
@@ -45,9 +46,9 @@ import org.apache.tika.eval.db.TableInfo;
* DBWriter creates its own PreparedStatements at initialization.
*/
public class DBWriter implements IDBWriter {
- private static final AtomicInteger WRITER_ID = new AtomicInteger();
+ private static final Logger LOG = LoggerFactory.getLogger(DBWriter.class);
- private static Logger logger = Logger.getLogger(DBWriter.class);
+ private static final AtomicInteger WRITER_ID = new AtomicInteger();
private final AtomicLong insertedRows = new AtomicLong();
private final Long commitEveryX = 1000L;
@@ -116,7 +117,7 @@ public class DBWriter implements IDBWriter {
dbUtil.insert(p, table, data);
long rows = insertedRows.incrementAndGet();
if (rows % commitEveryX == 0) {
- logger.debug("writer ("+myId+") is committing after "+ rows +
" rows");
+ LOG.debug("writer ({}) is committing after {} rows", myId,
rows);
conn.commit();
}
} catch (SQLException e) {
@@ -128,7 +129,6 @@ public class DBWriter implements IDBWriter {
try {
conn.commit();
} catch (SQLException e){
- e.printStackTrace();
throw new IOExceptionWithCause(e);
}
try {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
index 0364fef..20f8ab1 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
@@ -44,6 +44,7 @@ import org.slf4j.LoggerFactory;
*/
public class ExtractReader {
+    private static final Logger LOG = LoggerFactory.getLogger(ExtractReader.class);
public static final long IGNORE_LENGTH = -1L;
@@ -52,8 +53,8 @@ public class ExtractReader {
FIRST_ONLY, //take only the metadata list for the "container" document
        CONCATENATE_CONTENT_INTO_FIRST // concatenate all of the content into the first
}
-    private final static Logger LOGGER = LoggerFactory.getLogger(ExtractReader.class);
- TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+
+ private TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
private final ALTER_METADATA_LIST alterMetadataList;
private final long minExtractLength;
@@ -125,7 +126,7 @@ public class ExtractReader {
} else if (fileSuffixes.compression.equals("zip")) {
is = new ZCompressorInputStream(is);
} else {
- LOGGER.warn("Can't yet process compression of type: " +
fileSuffixes.compression);
+ LOG.warn("Can't yet process compression of type: {}",
fileSuffixes.compression);
return metadataList;
}
}
@@ -173,7 +174,7 @@ public class ExtractReader {
}
    private List<Metadata> generateListFromTextFile(Reader reader,
-                                                   FileSuffixes fileSuffixes) throws IOException {
+                                                    FileSuffixes fileSuffixes) throws IOException {
List<Metadata> metadataList = new ArrayList<>();
String content = IOUtils.toString(reader);
Metadata m = new Metadata();
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java
index 2db160a..739525e 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java
@@ -28,13 +28,13 @@ import java.sql.SQLException;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Level;
-import org.apache.log4j.Logger;
import org.apache.tika.parser.ParseContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class XMLLogReader {
-
- private final static Logger logger = Logger.getLogger(XMLLogReader.class);
+    private static final Logger LOG = LoggerFactory.getLogger(XMLLogReader.class);
//class that wraps a logger's xml output
//into a single xml parseable input stream.
@@ -60,10 +60,10 @@ public class XMLLogReader {
handler.handleMsg(level, reader.getElementText());
} catch (IOException e) {
e.printStackTrace();
- logger.warn("Error parsing:
"+reader.getElementText());
+ LOG.warn("Error parsing: {}",
reader.getElementText());
} catch (SQLException e) {
e.printStackTrace();
- logger.warn("SQLException: "+e.getMessage());
+ LOG.warn("SQLException: {}", e.getMessage());
}
}
break;
@@ -94,8 +94,8 @@ public class XMLLogReader {
private LogXMLWrappingInputStream(InputStream xmlLogFileIs){
streams = new InputStream[3];
            streams[0] = new ByteArrayInputStream(HEADER.getBytes(StandardCharsets.UTF_8));
-            streams[1] = xmlLogFileIs;
            streams[2] = new ByteArrayInputStream(FOOTER.getBytes(StandardCharsets.UTF_8));
+            streams[1] = xmlLogFileIs;
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java
index b7e2c09..8ac7fca 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java
@@ -31,7 +31,6 @@ import java.util.HashSet;
import java.util.Map;
import java.util.Set;
-import org.apache.log4j.Logger;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.Row;
@@ -39,13 +38,14 @@ import org.apache.poi.ss.usermodel.VerticalAlignment;
import org.apache.poi.xssf.streaming.SXSSFSheet;
import org.apache.poi.xssf.streaming.SXSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* This class represents a single report.
*/
public class Report {
-
- static final Logger logger = Logger.getLogger(Report.class);
+ private static final Logger LOG = LoggerFactory.getLogger(Report.class);
final String NULL_VALUE = "";//TODO: make this configurable!!!
Map<String, XSLXCellFormatter> cellFormatters = new HashMap<>();
@@ -60,7 +60,7 @@ public class Report {
String reportName;
    public void writeReport(Connection c, Path reportsRoot) throws SQLException, IOException {
- logger.info("Writing report: "+reportName + " to "+reportFilename);
+ LOG.info("Writing report: {} to {}", reportName, reportFilename);
dumpXLSX(c, reportsRoot);
}
@@ -173,6 +173,7 @@ public class Report {
}
break;
//fall through strings
+ case Types.BOOLEAN:
case Types.CHAR:
case Types.VARCHAR:
case Types.LONGNVARCHAR:
@@ -189,8 +190,7 @@ public class Report {
} else {
cell.setCellValue(rs.getString(colIndex));
}
- logger.warn("Couldn't find type for: " +
meta.getColumnType(colIndex) +
- ". Defaulting to String");
+ LOG.warn("Couldn't find type for: {}. Defaulting to String",
meta.getColumnType(colIndex));
}
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java
index 072ba53..8f36d8b 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java
@@ -53,8 +53,7 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
public class ResultsReporter {
-
-    protected static Logger LOGGER = LoggerFactory.getLogger(ResultsReporter.class);
+    private static final Logger LOG = LoggerFactory.getLogger(ResultsReporter.class);
private static Options OPTIONS;
@@ -257,7 +256,7 @@ public class ResultsReporter {
            Path reportsRootDirectory = Paths.get(commandLine.getOptionValue("rd", "reports"));
            if (Files.isDirectory(reportsRootDirectory)) {
-                LOGGER.warn("'Reports' directory exists. Will overwrite existing reports.");
+                LOG.warn("'Reports' directory exists. Will overwrite existing reports.");
}
resultsReporter.execute(c, reportsRootDirectory);
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java
index 83ca557..2389309 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java
@@ -35,6 +35,7 @@ import com.google.gson.JsonParseException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
@@ -52,6 +53,12 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
private static final String PARAMS = "params";
private static final String COMMENT = "_comment";
+ private final int maxTokens;
+
+ AnalyzerDeserializer(int maxTokens) {
+ this.maxTokens = maxTokens;
+ }
+
@Override
    public Map<String, Analyzer> deserialize(JsonElement element, Type type,
                                             JsonDeserializationContext jsonDeserializationContext) throws JsonParseException {
@@ -64,14 +71,14 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
            throw new IllegalArgumentException("Expecting top level 'analyzers:{}");
}
try {
- return buildAnalyzers(root);
+ return buildAnalyzers(root, maxTokens);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
-    public static Map<String, Analyzer> buildAnalyzers(JsonElement value) throws IOException {
+    public static Map<String, Analyzer> buildAnalyzers(JsonElement value, int maxTokens) throws IOException {
        if (! value.isJsonObject()) {
            throw new IllegalArgumentException("Expecting map with analyzer names/analyzer definitions");
        }
@@ -79,13 +86,13 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
JsonObject root = (JsonObject)value;
for (Map.Entry<String, JsonElement> e : root.entrySet()) {
String analyzerName = e.getKey();
- Analyzer analyzer = buildAnalyzer(analyzerName, e.getValue());
+            Analyzer analyzer = buildAnalyzer(analyzerName, e.getValue(), maxTokens);
analyzers.put(analyzerName, analyzer);
}
return analyzers;
}
-    public static Analyzer buildAnalyzer(String analyzerName, JsonElement value) throws IOException {
+    public static Analyzer buildAnalyzer(String analyzerName, JsonElement value, int maxTokens) throws IOException {
        if (! value.isJsonObject()) {
            throw new IllegalArgumentException("Expecting map of charfilter, tokenizer, tokenfilters");
        }
@@ -98,7 +105,7 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
if (k.equals(CHAR_FILTERS)) {
charFilters = buildCharFilters(e.getValue(), analyzerName);
} else if (k.equals(TOKEN_FILTERS)) {
-                tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName);
+                tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName, maxTokens);
            } else if (k.equals(TOKENIZER)) {
                tokenizerFactory = buildTokenizerFactory(e.getValue(), analyzerName);
} else if (! k.equals(COMMENT)) {
@@ -212,7 +219,7 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
    }
    private static TokenFilterFactory[] buildTokenFilterFactories(JsonElement el,
-                                                                  String analyzerName) throws IOException {
+                                                                  String analyzerName, int maxTokens) throws IOException {
if (el == null || el.isJsonNull()) {
return null;
}
@@ -261,6 +268,13 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
                throw new IllegalArgumentException("While loading "+analyzerName, e);
}
}
+
+ if (maxTokens > -1) {
+ Map<String, String> m = new HashMap<>();
+ m.put("maxTokenCount", Integer.toString(maxTokens));
+ ret.add(new LimitTokenCountFilterFactory(m));
+ }
+
if (ret.size() == 0) {
return new TokenFilterFactory[0];
}
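
With the hard-coded LimitTokenCountFilterFactory stripped out of lucene-analyzers.json (see below), the factory is now appended programmatically whenever maxTokens > -1. A minimal sketch of what that factory does, assuming Lucene's analysis-common module is on the classpath (as tika-eval already requires); the maxTokenCount of 5 is purely illustrative:

    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;

    public class LimitTokenCountSketch {
        public static void main(String[] args) throws Exception {
            // Same params this commit removes from lucene-analyzers.json and
            // now injects in buildTokenFilterFactories above.
            Map<String, String> params = new HashMap<>();
            params.put("maxTokenCount", "5");
            LimitTokenCountFilterFactory factory = new LimitTokenCountFilterFactory(params);

            WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader("one two three four five six seven"));
            TokenStream ts = factory.create(tokenizer);
            ts.reset();
            int tokens = 0;
            while (ts.incrementToken()) {
                tokens++;
            }
            ts.end();
            ts.close();
            System.out.println(tokens); // 5, not 7: the filter caps the stream
        }
    }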
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
index 0e951b8..c5aa831 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
+import java.nio.charset.StandardCharsets;
import java.util.Map;
import com.google.gson.Gson;
@@ -41,11 +42,11 @@ public class AnalyzerManager {
this.commonTokensAnalyzer = commonTokensAnalyzer;
}
- public static AnalyzerManager newInstance() throws IOException {
+    public static AnalyzerManager newInstance(int maxTokens) throws IOException {
InputStream is =
AnalyzerManager.class.getClassLoader().getResourceAsStream("lucene-analyzers.json");
- Reader reader = new InputStreamReader(is, "UTF-8");
+ Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
GsonBuilder builder = new GsonBuilder();
-        builder.registerTypeHierarchyAdapter(Map.class, new AnalyzerDeserializer());
+        builder.registerTypeHierarchyAdapter(Map.class, new AnalyzerDeserializer(maxTokens));
Gson gson = builder.create();
Map<String, Analyzer> map = gson.fromJson(reader, Map.class);
Analyzer general = map.get(GENERAL);
@@ -58,7 +59,7 @@ public class AnalyzerManager {
throw new JsonParseException("Must specify "+ COMMON_TOKENS + "
analyzer");
}
- return new AnalyzerManager(general,common);
+ return new AnalyzerManager(general, common);
}
/**
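
Since newInstance() now takes maxTokens explicitly, every caller picks its own cap: the updated tests below pass 100000, and the new commandline default is 200000. A usage sketch grounded in those call sites:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.tika.eval.tokens.AnalyzerManager;

    public class AnalyzerManagerUsage {
        public static void main(String[] args) throws Exception {
            // 200000 matches the new commandline default; per the deserializer
            // above, -1 would skip adding the limit filter entirely.
            AnalyzerManager analyzerManager = AnalyzerManager.newInstance(200000);
            Analyzer general = analyzerManager.getGeneralAnalyzer();
            System.out.println(general != null); // true
        }
    }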
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
index c6d9947..9997152 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
@@ -36,9 +36,9 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class CommonTokenCountManager {
+    private static final Logger LOG = LoggerFactory.getLogger(CommonTokenCountManager.class);
    private final static Charset COMMON_TOKENS_CHARSET = StandardCharsets.UTF_8;
-    private final static Logger LOGGER = LoggerFactory.getLogger(CommonTokenCountManager.class);
private final Path commonTokensDir;
@@ -56,7 +56,7 @@ public class CommonTokenCountManager {
//set to prevent npes later
Set<String> set = commonTokenMap.get(defaultLangCode);
if (set == null) {
- LOGGER.warn("No common tokens for default language:
'"+defaultLangCode+"'");
+ LOG.warn("No common tokens for default language:
'"+defaultLangCode+"'");
commonTokenMap.put(defaultLangCode, new HashSet<String>());
}
}
@@ -128,7 +128,7 @@ public class CommonTokenCountManager {
if (is == null) {
- LOGGER.warn("Couldn't find common tokens file for: '" +
langCode + "': " +
+ LOG.warn("Couldn't find common tokens file for: '" + langCode
+ "': " +
p.toAbsolutePath());
alreadyTriedToLoad.add(langCode);
return;
@@ -161,7 +161,7 @@ public class CommonTokenCountManager {
}
}
} catch (IOException e) {
- LOGGER.warn("IOException trying to read: '" + langCode + "'");
+ LOG.warn("IOException trying to read: '" + langCode + "'");
} finally {
IOUtils.closeQuietly(is);
}
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index b980d69..b447335 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -45,59 +45,161 @@
having cnt > 1
order by cnt desc
</sql>
+ <!-- build mime indexes -->
+
<sql>create index if not exists pa_m_idx
on profiles_a (mime_type_id);
</sql>
+
<sql>
create index if not exists pb_m_idx
on profiles_b (mime_type_id);
</sql>
+ <!-- build exceptions comparison table -->
<sql>drop table if exists exceptions_compared</sql>
<sql>
create table exceptions_compared
- (mime_type_id integer primary key,
- exceptions_a integer,
- total_a integer,
- percent_exceptions_a double,
- exceptions_b integer,
- total_b integer,
- percent_exceptions_b double)
+ (mime_type_id_a integer, mime_type_id_b integer,
+ exceptions_a integer default 0,
+ total_a integer default 0,
+ percent_exceptions_a double default 0.0,
+ exceptions_b integer default 0,
+ total_b integer default 0,
+ percent_exceptions_b double default 0.0);
</sql>
<sql>
- insert into exceptions_compared (mime_type_id)
- select mime_type_id from mimes;
+
+ insert into exceptions_compared (mime_type_id_a, mime_type_id_b)
+ select ma.mime_type_id, mb.mime_type_id
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_type_id=a.mime_type_id
+ join mimes mb on mb.mime_type_id=b.mime_type_id
+ group by ma.mime_type_id, mb.mime_type_id
</sql>
<sql>
update exceptions_compared ec set total_a=(
- select count(1) as cnt from profiles_a
- where profiles_a.mime_type_id= ec.mime_type_id
- group by mime_type_id
- )
+ select count(1) as cnt from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ where pa.mime_type_id= ec.mime_type_id_a
+ and pb.mime_type_id=ec.mime_type_id_b
+ group by pa.mime_type_id, pb.mime_type_id);
</sql>
<sql>
update exceptions_compared ec set total_b=(
- select count(1) as cnt from profiles_b
- where profiles_b.mime_type_id= ec.mime_type_id
- group by mime_type_id
- )
+ select count(1) as cnt from profiles_b pb
+ join profiles_a pa on pa.id=pb.id
+ where pa.mime_type_id= ec.mime_type_id_a
+ and pb.mime_type_id=ec.mime_type_id_b
+ group by pb.mime_type_id, pa.mime_type_id);
</sql>
<sql>
- update exceptions_compared ec set exceptions_a=( select count(1) as
- cnt from exceptions_a ea
+ update exceptions_compared ec set exceptions_a=
+ ( select count(1) as cnt from exceptions_a ea
join profiles_a pa on ea.id=pa.id
- where pa.mime_type_id= ec.mime_type_id
+ join profiles_b pb on pa.id=pb.id
+ where pa.mime_type_id= ec.mime_type_id_a
+ and pb.mime_type_id=ec.mime_type_id_b
and parse_exception_type_id=0
- group by mime_type_id )
+ group by pa.mime_type_id, pb.mime_type_id);
</sql>
<sql>
- update exceptions_compared ec set exceptions_b=(
- select count(1) as cnt from exceptions_b eb
- join profiles_b pb on eb.id=pb.id
- where pb.mime_type_id= ec.mime_type_id
+ update exceptions_compared ec set exceptions_b=
+ ( select count(1) as cnt from exceptions_b eb
+            join profiles_b pb on eb.id=pb.id
+ join profiles_a pa on pa.id=pb.id
+ where pa.mime_type_id= ec.mime_type_id_a
+ and pb.mime_type_id=ec.mime_type_id_b
and parse_exception_type_id=0
- group by mime_type_id )
+ group by pb.mime_type_id, pa.mime_type_id);
+ </sql>
+
+ <sql>
+ update exceptions_compared
+ set percent_exceptions_a =
+ (cast (exceptions_a as decimal))/(cast (total_a as decimal))
+ where total_a > 0
+ </sql>
+ <sql>
+ update exceptions_compared
+ set percent_exceptions_b =
+ (cast (exceptions_b as decimal))/(cast (total_b as decimal))
+ where total_b > 0
+ </sql>
+
+            <!-- build token counts comparison table -->
+ <sql>drop table if exists token_counts_compared</sql>
+ <sql>
+ create table token_counts_compared
+ (mime_type_id integer primary key,
+ num_tokens_a integer default 0,
+ num_alphabetic_tokens_a integer default 0,
+ num_common_tokens_a integer default 0,
+ num_tokens_b integer default 0,
+ num_alphabetic_tokens_b integer default 0,
+            num_common_tokens_b integer default 0
+ );
+ </sql>
+ <sql>
+ insert into token_counts_compared (mime_type_id)
+ select mime_type_id from mimes;
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_tokens_a=(
+ select sum(num_tokens) as cnt from profiles_a p
+ join contents_a c on c.id = p.id
+ where p.mime_type_id= tcc.mime_type_id
+ group by mime_type_id
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_tokens_b=(
+ select sum(num_tokens) as cnt from profiles_b p
+            join contents_b c on c.id = p.id
+ where p.mime_type_id= tcc.mime_type_id
+ group by mime_type_id
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_alphabetic_tokens_a=(
+ select sum(num_alphabetic_tokens) as cnt from profiles_a p
+ join contents_a c on c.id = p.id
+ where p.mime_type_id= tcc.mime_type_id
+ group by mime_type_id
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_alphabetic_tokens_b=(
+ select sum(num_alphabetic_tokens) as cnt from profiles_b p
+ join contents_b c on c.id = p.id
+ where p.mime_type_id= tcc.mime_type_id
+ group by mime_type_id
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_common_tokens_a=(
+ select sum(num_common_tokens) as cnt from profiles_a p
+ join contents_a c on c.id = p.id
+ where p.mime_type_id= tcc.mime_type_id
+ group by mime_type_id
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_common_tokens_b=(
+ select sum(num_common_tokens) as cnt from profiles_b p
+ join contents_b c on c.id = p.id
+ where p.mime_type_id= tcc.mime_type_id
+ group by mime_type_id
+ );
</sql>
</before>
@@ -667,6 +769,21 @@
limit 100000
</sql>
</report>
+ <report reportName="ExceptionComparisonsByMimeType"
+ reportFilename="exceptions/exceptions_compared_by_mime_type.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select ma.mime_string, mb.mime_string, exceptions_a,
+ total_a, percent_exceptions_a,
+ exceptions_b, total_b, percent_exceptions_b
+ from exceptions_compared c
+ join mimes ma on ma.mime_type_id=c.mime_type_id_a
+ join mimes mb on mb.mime_type_id=c.mime_type_id_b
+ order by percent_exceptions_b desc, total_b desc;
+ </sql>
+ </report>
<!-- <report reportName="MD5 Duplicate Counts A"
reportFilename="md5/md5_duplicate_counts_A.xlsx"
format="xlsx"
diff --git a/tika-eval/src/main/resources/log4j.properties b/tika-eval/src/main/resources/log4j.properties
index 925f9f2..22a5bfc 100644
--- a/tika-eval/src/main/resources/log4j.properties
+++ b/tika-eval/src/main/resources/log4j.properties
@@ -1,8 +1,5 @@
-log4j.rootLogger=WARN,A1
-
-#for debugging
-#log4j.rootLogger=TRACE,A1
+log4j.rootLogger=INFO,A1
log4j.appender.A1=org.apache.log4j.ConsoleAppender
diff --git a/tika-eval/src/main/resources/lucene-analyzers.json b/tika-eval/src/main/resources/lucene-analyzers.json
index 663ebe2..aa24b79 100644
--- a/tika-eval/src/main/resources/lucene-analyzers.json
+++ b/tika-eval/src/main/resources/lucene-analyzers.json
@@ -23,13 +23,6 @@
"params": {
"outputUnigrams": "false"
}
- },
- {
- "factory": "oala.miscellaneous.LimitTokenCountFilterFactory",
- "params": {
- "maxTokenCount": "100000",
- "consumeAllTokens": "false"
- }
}
]
},
diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml
index fabddc8..1f9be6a 100644
--- a/tika-eval/src/main/resources/profile-reports.xml
+++ b/tika-eval/src/main/resources/profile-reports.xml
@@ -114,6 +114,7 @@
reportFilename="exceptions/exceptions_by_type.xlsx"
format="xlsx"
includeSql="true">
+
<sql>
select parse_exception_description, count(1) cnt
from parse_exceptions e
diff --git a/tika-eval/src/main/resources/tika-eval-comparison-config.xml b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
index 9b3697b..887a3e7 100644
--- a/tika-eval/src/main/resources/tika-eval-comparison-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
@@ -59,6 +59,12 @@
description="EXPERT: prefix for table names for B"/>
<option opt="drop" hasArg="false" description="drop tables if they
exist"/>
<option opt="maxFilesToAdd" hasArg="true" description="maximum number
of files to add to the crawler"/>
+ <option opt="maxTokens" hasArg="true" description="maximum tokens to
process, default=200000"/>
+ <option opt="maxContentLength" hasArg="true"
+ description="truncate content beyond this length for
calculating 'contents' stats, default=1000000"/>
+ <option opt="maxContentLengthForLangId" hasArg="true"
+ description="truncate content beyond this length for language
id, default=50000"/>
+
</commandline>
@@ -85,6 +91,8 @@
commonTokens="resources/common_tokens"
/>
+ <!-- langModelDir="resources/langmodels" -->
+
<!-- reporter and interrupter are optional -->
<reporter
builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder"
sleepMillis="1000"
staleThresholdMillis="500000"/>
diff --git a/tika-eval/src/main/resources/tika-eval-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
index 9da2aeb..a7e6d03 100644
--- a/tika-eval/src/main/resources/tika-eval-profiler-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
@@ -54,6 +54,11 @@
description="EXPERT: prefix for table names"/>
<option opt="drop" hasArg="false" description="drop tables if they
exist"/>
<option opt="maxFilesToAdd" hasArg="true" description="maximum number
of files to add to the crawler"/>
+ <option opt="maxTokens" hasArg="true" description="maximum tokens to
process, default=200000"/>
+ <option opt="maxContentLength" hasArg="true"
+ description="truncate content beyond this length for
calculating 'contents' stats, default=1000000"/>
+ <option opt="maxContentLengthForLangId" hasArg="true"
+ description="truncate content beyond this length for language
id, default=50000"/>
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java
index 7a8a8fb..9caacd7 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java
@@ -36,7 +36,7 @@ public class AnalyzerManagerTest {
@Test
public void testGeneral() throws Exception {
- AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
+ AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
Analyzer general = analyzerManager.getGeneralAnalyzer();
TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD
dirty dog");
ts.reset();
@@ -57,7 +57,7 @@ public class AnalyzerManagerTest {
@Test
public void testCommon() throws Exception {
- AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
+ AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
Analyzer common = analyzerManager.getCommonTokensAnalyzer();
TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
ts.reset();
@@ -80,21 +80,20 @@ public class AnalyzerManagerTest {
@Test
public void testTokenCountFilter() throws Exception {
- AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
+ AnalyzerManager analyzerManager = AnalyzerManager.newInstance(1000000);
StringBuilder sb = new StringBuilder();
- for (int i = 0; i < 101000; i++) {
+ for (int i = 0; i < 1001000; i++) {
sb.append("the ");
}
TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f",
sb.toString());
ts.reset();
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
- Set<String> seen = new HashSet<>();
int tokens = 0;
while (ts.incrementToken()) {
tokens++;
}
- assertEquals(100000, tokens);
+ assertEquals(1000000, tokens);
}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index 85e91dd..761f961 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -26,6 +26,7 @@ import static org.junit.Assert.assertTrue;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
@@ -160,20 +161,24 @@ public class SimpleComparerTest extends TikaTest {
public void testGetContent() throws Exception {
Metadata m = new Metadata();
m.add(RecursiveParserWrapper.TIKA_CONTENT, "0123456789");
-
- String content = getContent(m, 10);
+ Map<Cols, String> data = new HashMap<>();
+ String content = getContent(m, 10, data);
assertEquals(10, content.length());
+ assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
- content = getContent(m, 4);
+ content = getContent(m, 4, data);
assertEquals(4, content.length());
+ assertEquals("TRUE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
//test Metadata with no content
- content = getContent(new Metadata(), 10);
+ content = getContent(new Metadata(), 10, data);
assertEquals(0, content.length());
+ assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
//test null Metadata
- content = getContent(null, 10);
+ content = getContent(null, 10, data);
assertEquals(0, content.length());
+ assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
}
@Test
@@ -288,4 +293,35 @@ public class SimpleComparerTest extends TikaTest {
System.out.println(key + " : " + row.get(key));
}
}
+
+ @Test
+ @Ignore("useful for testing 2 files not in test set")
+ public void oneOff() throws Exception {
+ Path p1 = Paths.get("");
+ Path p2 = Paths.get("");
+
+ EvalFilePaths fpsA = new EvalFilePaths(
+ Paths.get("file1.pdf.json"),
+ p1
+ );
+ EvalFilePaths fpsB = new EvalFilePaths(
+ Paths.get("file1.pdf.json"),
+ p2
+ );
+ comparer.compareFiles(fpsA, fpsB);
+ for (TableInfo t : new TableInfo[]{
+ ExtractComparer.COMPARISON_CONTAINERS,
+ ExtractComparer.EXTRACT_EXCEPTION_TABLE_A,
+ ExtractComparer.EXTRACT_EXCEPTION_TABLE_B,
+ ExtractComparer.EXCEPTION_TABLE_A,
+ ExtractComparer.EXCEPTION_TABLE_B,
+ ExtractComparer.PROFILES_A,
+ ExtractComparer.PROFILES_B,
+ ExtractComparer.CONTENTS_TABLE_A,
+ ExtractComparer.CONTENTS_TABLE_B,
+ ExtractComparer.CONTENT_COMPARISONS}) {
+ debugPrintTable(t);
+ }
+
+ }
}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
index 5274dd4..288f042 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
@@ -58,7 +58,7 @@ public class TikaEvalCLITest extends TikaTest {
compareDBDir = Files.createTempDirectory("tika-eval-cli-compare-db-");
profileDBDir = Files.createTempDirectory("tika-eval-cli-profile-db-");
        compareReportsDir = Files.createTempDirectory("tika-eval-cli-compare-reports-");
-        profileReportsDir = Files.createTempDirectory("tika-eval-cli-compare-reports-");
+        profileReportsDir = Files.createTempDirectory("tika-eval-cli-profile-reports-");
compare();
profile();
reportCompare();
@@ -115,7 +115,7 @@ public class TikaEvalCLITest extends TikaTest {
cnt++;
}
}
- assertTrue(cnt > 5);
+ assertTrue(cnt > 33);
}
@@ -127,6 +127,14 @@ public class TikaEvalCLITest extends TikaTest {
args.add(extractsDir.resolve("extractsA").toAbsolutePath().toString());
args.add("-extractsB");
args.add(extractsDir.resolve("extractsB").toAbsolutePath().toString());
+        //add these just to confirm this info doesn't cause problems with the CLI
+ args.add("-maxTokens");
+ args.add("10000000");
+ args.add("-maxContentLength");
+ args.add("100000000");
+ args.add("-maxContentLengthForLangId");
+ args.add("100000");
+
args.add("-db");
args.add(compareDBDir.toAbsolutePath().toString()+"/"+dbName);
@@ -139,6 +147,14 @@ public class TikaEvalCLITest extends TikaTest {
args.add("Profile");
args.add("-extracts");
args.add(extractsDir.resolve("extractsA").toAbsolutePath().toString());
+        //add these just to confirm this info doesn't cause problems with the CLI
+ args.add("-maxTokens");
+ args.add("10000000");
+ args.add("-maxContentLength");
+ args.add("100000000");
+ args.add("-maxContentLengthForLangId");
+ args.add("100000");
+
args.add("-db");
args.add(profileDBDir.toAbsolutePath().toString()+"/"+dbName);
execute(args, 60000);
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/db/AbstractBufferTest.java b/tika-eval/src/test/java/org/apache/tika/eval/db/AbstractBufferTest.java
index 7b5c3cb..810425b 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/db/AbstractBufferTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/db/AbstractBufferTest.java
@@ -47,7 +47,7 @@ public class AbstractBufferTest {
Collections.addAll(keys, new String[]{
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"});
- int numGets = 1000;
+ int numGets = 100;
int numTesters = 20;
AbstractDBBuffer b = new TestBuffer();
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java b/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java
index 40abdaa..9c6325d 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java
@@ -41,7 +41,7 @@ public class TokenCounterTest {
@BeforeClass
public static void setUp() throws IOException {
- analyzerManager = AnalyzerManager.newInstance();
+ analyzerManager = AnalyzerManager.newInstance(100000);
}
diff --git a/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml b/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml
index de730ed..bb83f8b 100644
--- a/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml
+++ b/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml
@@ -66,7 +66,7 @@
consumerBuilderClass="org.apache.tika.eval.batch.ExtractProfilerBuilder"
errorLogFile="src/test/resources/test-dirs/batch-logs/batch-process-fatal.xml"
extracts="src/test/resources/test-dirs/extractsA"
- commonTokens="src/test/resources/common_tokens_short.txt"/>
+ commonTokens="src/test/resources/common_tokens"/>
<!-- reporter and interrupter are optional -->
--
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.