This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4671-lang-aware-charset-detection in repository https://gitbox.apache.org/repos/asf/tika.git
commit 006739ce4b9d7eb534a6b06b1fd10be5029139cf Author: tallison <[email protected]> AuthorDate: Thu Feb 19 08:05:30 2026 -0500 TIKA-4671 - tweaks --- .../langdetect/charsoup/CharSoupEncodingDetector.java | 2 +- .../langdetect/charsoup/CharSoupLanguageDetector.java | 2 +- .../apache/tika/config/TikaEncodingDetectorTest.java | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java index cb393cadf7..75176f69fc 100644 --- a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java +++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java @@ -54,7 +54,7 @@ import org.apache.tika.parser.ParseContext; * * @since Apache Tika 3.2 */ -@TikaComponent +@TikaComponent(name = "charsoup-encoding-detector") public class CharSoupEncodingDetector implements MetaEncodingDetector { private static final long serialVersionUID = 1L; diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java index 6e60e88447..31534f2e38 100644 --- a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java +++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java @@ -53,7 +53,7 @@ import org.apache.tika.language.detect.LanguageResult; * keeping the implementation simple and predictable. * </p> */ -@TikaComponent +@TikaComponent(name = "charsoup-language-detector") public class CharSoupLanguageDetector extends LanguageDetector { private static final Logger LOG = diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java index bbd3caf272..2524ef404d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java @@ -271,6 +271,24 @@ public class TikaEncodingDetectorTest extends TikaTest { } + @Test + public void testExcludeCharSoupEncodingDetector() throws Exception { + TikaLoader tikaLoader = TikaLoaderHelper.getLoader( + "TIKA-4671-exclude-charsoup-encoding-detector.json"); + EncodingDetector detector = tikaLoader.loadEncodingDetectors(); + assertTrue(detector instanceof CompositeEncodingDetector); + List<EncodingDetector> detectors = + ((CompositeEncodingDetector) detector).getDetectors(); + // 3 base detectors, no MetaEncodingDetector + assertEquals(3, detectors.size()); + assertTrue(detectors.get(0) instanceof HtmlEncodingDetector); + assertTrue(detectors.get(1) instanceof UniversalEncodingDetector); + assertTrue(detectors.get(2) instanceof Icu4jEncodingDetector); + for (EncodingDetector d : detectors) { + assertNotContained("CharSoup", d.getClass().getSimpleName()); + } + } + @Test public void testArabicMisleadingCharsetHtml() throws Exception { // This HTML file is encoded in windows-1256 but declares charset=UTF-8
