This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git.
from a279d03 TIKA-2278 clean up extract exception handling
new b2a462c TIKA 2276 -- cleanup
new 6dcad88 TIKA-2273 -- improve configuration of encoding detectors. TODO: figure out loading in tika-app bundle and turn tests back on.
new 5925bcb TIKA-2279 - simplify token counting
new 82509f3 TIKA-1857 xfa fix
The 4 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "adds" were already present in the repository and have only
been added to this reference.
Summary of changes:
CHANGES.txt | 3 +
.../tika/config/TikaEncodingDetectorTest.java | 198 +++++++++++++++++++++
...KA-2273-blacklist-encoding-detector-default.xml | 19 +-
...-2273-encoding-detector-outside-static-init.xml | 17 +-
.../TIKA-2273-no-icu4j-encoding-detector.xml | 23 ++-
.../TIKA-2273-non-detecting-params-bad-charset.xml | 18 +-
.../tika/config/TIKA-2273-non-detecting-params.xml | 24 ++-
.../TIKA-2273-parameterize-encoding-detector.xml | 25 ++-
.../test/java/org/apache/tika/bundle/BundleIT.java | 7 +-
.../java/org/apache/tika/config/ServiceLoader.java | 4 +-
.../java/org/apache/tika/config/TikaConfig.java | 181 +++++++++++++++++--
.../org/apache/tika/detect/AutoDetectReader.java | 23 ++-
.../tika/detect/CompositeEncodingDetector.java | 92 ++++++++++
.../tika/detect/DefaultEncodingDetector.java | 53 ++++++
.../org/apache/tika/detect/EncodingDetector.java | 3 +-
.../tika/detect/NonDetectingEncodingDetector.java | 67 +++++++
.../tika/extractor/EmbeddedDocumentUtil.java | 67 +++++--
.../parser/AbstractEncodingDetectorParser.java | 64 +++++++
.../java/org/apache/tika/parser/DefaultParser.java | 47 ++++-
.../src/test/java/org/apache/tika/TikaTest.java | 11 +-
.../org/apache/tika/config/TikaConfigTest.java | 4 +-
.../org.apache.tika.detect.EncodingDetector | 2 +-
.../org/apache/tika/eval/AbstractProfiler.java | 10 +-
.../eval/tokens/AlphaIdeographFilterFactory.java | 30 ++--
.../tika/eval/tokens/AnalyzerDeserializer.java | 15 +-
.../apache/tika/eval/tokens/AnalyzerManager.java | 19 +-
.../tika/eval/tokens/CommonTokenCountManager.java | 14 +-
.../apache/tika/eval/tokens/CommonTokenResult.java | 21 ++-
.../org/apache/tika/eval/tokens/TokenCounter.java | 24 +--
tika-eval/src/main/resources/lucene-analyzers.json | 39 ++--
.../org/apache/tika/eval/AnalyzerManagerTest.java | 32 +++-
.../tika/eval/tokens/LuceneTokenCounter.java | 8 +-
.../apache/tika/eval/tokens/TokenCounterTest.java | 19 +-
.../apache/tika/parser/code/SourceCodeParser.java | 26 +--
.../org/apache/tika/parser/pdf/XFAExtractor.java | 28 ++-
.../java/org/apache/tika/parser/chm/ChmParser.java | 2 +-
.../tika/parser/microsoft/JackcessExtractor.java | 9 +-
.../tika/parser/microsoft/OutlookExtractor.java | 15 +-
.../apache/tika/parser/envi/EnviHeaderParser.java | 14 +-
.../org/apache/tika/parser/isatab/ISATabUtils.java | 26 ++-
.../tika/parser/txt/Icu4jEncodingDetector.java | 23 +++
.../java/org/apache/tika/parser/txt/TXTParser.java | 21 ++-
.../org/apache/tika/parser/html/HtmlParser.java | 20 ++-
43 files changed, 1082 insertions(+), 285 deletions(-)
create mode 100644 tika-app/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
copy tika-core/src/test/resources/org/apache/tika/config/TIKA-1700-unknown-parser.xml => tika-app/src/test/resources/org/apache/tika/config/TIKA-2273-blacklist-encoding-detector-default.xml (56%)
copy tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/tika-config-for-ner.xml => tika-app/src/test/resources/org/apache/tika/config/TIKA-2273-encoding-detector-outside-static-init.xml (56%)
copy tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml => tika-app/src/test/resources/org/apache/tika/config/TIKA-2273-no-icu4j-encoding-detector.xml (62%)
copy tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/tika-config-for-ner.xml => tika-app/src/test/resources/org/apache/tika/config/TIKA-2273-non-detecting-params-bad-charset.xml (67%)
copy tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml => tika-app/src/test/resources/org/apache/tika/config/TIKA-2273-non-detecting-params.xml (67%)
copy tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml => tika-app/src/test/resources/org/apache/tika/config/TIKA-2273-parameterize-encoding-detector.xml (63%)
create mode 100644 tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
create mode 100644 tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
create mode 100644 tika-core/src/main/java/org/apache/tika/detect/NonDetectingEncodingDetector.java
create mode 100644 tika-core/src/main/java/org/apache/tika/parser/AbstractEncodingDetectorParser.java
copy {tika-parser-modules/tika-parser-web-module/src/main => tika-core/src/test}/resources/META-INF/services/org.apache.tika.detect.EncodingDetector (93%)
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].