This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git.

      from  a279d03   TIKA-2278    clean up extract exception handling
       new  b2a462c   TIKA-2276 -- cleanup
       new  6dcad88   TIKA-2273 -- improve configuration of encoding detectors. TODO: figure out loading in tika-app bundle and turn tests back on.
       new  5925bcb   TIKA-2279 -   simplify token counting
       new  82509f3   TIKA-1857 xfa fix

The 4 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "adds" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |   3 +
 .../tika/config/TikaEncodingDetectorTest.java      | 198 +++++++++++++++++++++
 ...KA-2273-blacklist-encoding-detector-default.xml |  19 +-
 ...-2273-encoding-detector-outside-static-init.xml |  17 +-
 .../TIKA-2273-no-icu4j-encoding-detector.xml       |  23 ++-
 .../TIKA-2273-non-detecting-params-bad-charset.xml |  18 +-
 .../tika/config/TIKA-2273-non-detecting-params.xml |  24 ++-
 .../TIKA-2273-parameterize-encoding-detector.xml   |  25 ++-
 .../test/java/org/apache/tika/bundle/BundleIT.java |   7 +-
 .../java/org/apache/tika/config/ServiceLoader.java |   4 +-
 .../java/org/apache/tika/config/TikaConfig.java    | 181 +++++++++++++++++--
 .../org/apache/tika/detect/AutoDetectReader.java   |  23 ++-
 .../tika/detect/CompositeEncodingDetector.java     |  92 ++++++++++
 .../tika/detect/DefaultEncodingDetector.java       |  53 ++++++
 .../org/apache/tika/detect/EncodingDetector.java   |   3 +-
 .../tika/detect/NonDetectingEncodingDetector.java  |  67 +++++++
 .../tika/extractor/EmbeddedDocumentUtil.java       |  67 +++++--
 .../parser/AbstractEncodingDetectorParser.java     |  64 +++++++
 .../java/org/apache/tika/parser/DefaultParser.java |  47 ++++-
 .../src/test/java/org/apache/tika/TikaTest.java    |  11 +-
 .../org/apache/tika/config/TikaConfigTest.java     |   4 +-
 .../org.apache.tika.detect.EncodingDetector        |   2 +-
 .../org/apache/tika/eval/AbstractProfiler.java     |  10 +-
 .../eval/tokens/AlphaIdeographFilterFactory.java   |  30 ++--
 .../tika/eval/tokens/AnalyzerDeserializer.java     |  15 +-
 .../apache/tika/eval/tokens/AnalyzerManager.java   |  19 +-
 .../tika/eval/tokens/CommonTokenCountManager.java  |  14 +-
 .../apache/tika/eval/tokens/CommonTokenResult.java |  21 ++-
 .../org/apache/tika/eval/tokens/TokenCounter.java  |  24 +--
 tika-eval/src/main/resources/lucene-analyzers.json |  39 ++--
 .../org/apache/tika/eval/AnalyzerManagerTest.java  |  32 +++-
 .../tika/eval/tokens/LuceneTokenCounter.java       |   8 +-
 .../apache/tika/eval/tokens/TokenCounterTest.java  |  19 +-
 .../apache/tika/parser/code/SourceCodeParser.java  |  26 +--
 .../org/apache/tika/parser/pdf/XFAExtractor.java   |  28 ++-
 .../java/org/apache/tika/parser/chm/ChmParser.java |   2 +-
 .../tika/parser/microsoft/JackcessExtractor.java   |   9 +-
 .../tika/parser/microsoft/OutlookExtractor.java    |  15 +-
 .../apache/tika/parser/envi/EnviHeaderParser.java  |  14 +-
 .../org/apache/tika/parser/isatab/ISATabUtils.java |  26 ++-
 .../tika/parser/txt/Icu4jEncodingDetector.java     |  23 +++
 .../java/org/apache/tika/parser/txt/TXTParser.java |  21 ++-
 .../org/apache/tika/parser/html/HtmlParser.java    |  20 ++-
 43 files changed, 1082 insertions(+), 285 deletions(-)
 create mode 100644 tika-app/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
 copy tika-core/src/test/resources/org/apache/tika/config/TIKA-1700-unknown-parser.xml => tika-app/src/test/resources/org/apache/tika/config/TIKA-2273-blacklist-encoding-detector-default.xml (56%)
 copy tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/tika-config-for-ner.xml => tika-app/src/test/resources/org/apache/tika/config/TIKA-2273-encoding-detector-outside-static-init.xml (56%)
 copy tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml => tika-app/src/test/resources/org/apache/tika/config/TIKA-2273-no-icu4j-encoding-detector.xml (62%)
 copy tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/tika-config-for-ner.xml => tika-app/src/test/resources/org/apache/tika/config/TIKA-2273-non-detecting-params-bad-charset.xml (67%)
 copy tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml => tika-app/src/test/resources/org/apache/tika/config/TIKA-2273-non-detecting-params.xml (67%)
 copy tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml => tika-app/src/test/resources/org/apache/tika/config/TIKA-2273-parameterize-encoding-detector.xml (63%)
 create mode 100644 tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
 create mode 100644 tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
 create mode 100644 tika-core/src/main/java/org/apache/tika/detect/NonDetectingEncodingDetector.java
 create mode 100644 tika-core/src/main/java/org/apache/tika/parser/AbstractEncodingDetectorParser.java
 copy {tika-parser-modules/tika-parser-web-module/src/main => tika-core/src/test}/resources/META-INF/services/org.apache.tika.detect.EncodingDetector (93%)

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to