(tika) 01/03: enrich short text discriminative and generative features

tallison Fri, 20 Mar 2026 14:20:49 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4692-script-block-features
in repository https://gitbox.apache.org/repos/asf/tika.git


commit d1c6bf14f1c02f57652cbb8a546cb6ae903ba13d
Author: tballison <[email protected]>
AuthorDate: Fri Mar 13 23:33:44 2026 -0400

    enrich short text discriminative and generative features
---
 .../tika/langdetect/charsoup/CharSoupModel.java    |  19 +++-
 .../charsoup/GenerativeLanguageModel.java          | 123 ++++++++++++++++++---
 .../charsoup/ScriptAwareFeatureExtractor.java      | 101 +++++++++++++++--
 .../tika/langdetect/charsoup/ScriptCategory.java   |  44 +++++++-
 .../charsoup/ShortTextFeatureExtractor.java        |  96 ++++++++++++++--
 .../charsoup/CharSoupLanguageDetector.java         |   2 +-
 .../charsoup/ScriptAwareFeatureExtractorTest.java  |  83 ++++++++++----
 .../charsoup/tools/ResearchFeatureExtractor.java   |  91 ++++++++++++++-
 8 files changed, 494 insertions(+), 65 deletions(-)

diff --git 
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java
 
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java
index cbcf21512c..cef2b81b99 100644
--- 
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java
+++ 
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java
@@ -87,6 +87,8 @@ public class CharSoupModel {
     public static final int FLAG_4GRAMS        = 1 << 7;
     /** Feature flag: enable character 5-grams. */
     public static final int FLAG_5GRAMS        = 1 << 8;
+    /**
+     * Feature flag: enable sqrt-weighted script-block presence + transition features.
+     * NOTE(review): the extractors in this change emit weights that are linear in
+     * each script's share of letters (scaled by SCRIPT_SCALE); confirm whether a
+     * sqrt is applied elsewhere or whether "sqrt-weighted" should be dropped.
+     */
+    public static final int FLAG_SCRIPT_BLOCKS = 1 << 9;
 
     /** Default flags for v1 models (word unigrams only). */
     public static final int V1_DEFAULT_FLAGS = FLAG_WORD_UNIGRAMS;
@@ -401,19 +403,28 @@ public class CharSoupModel {
      */
     public FeatureExtractor createExtractor() {
         if (featureFlags == ScriptAwareFeatureExtractor.FEATURE_FLAGS) {
-            return new ScriptAwareFeatureExtractor(numBuckets);
+            return new ScriptAwareFeatureExtractor(numBuckets, true);
+        }
+        if (featureFlags == ScriptAwareFeatureExtractor.FEATURE_FLAGS_LEGACY) {
+            return new ScriptAwareFeatureExtractor(numBuckets, false);
         }
         if (featureFlags == ShortTextFeatureExtractor.FEATURE_FLAGS) {
-            return new ShortTextFeatureExtractor(numBuckets);
+            return new ShortTextFeatureExtractor(numBuckets, true);
+        }
+        if (featureFlags == ShortTextFeatureExtractor.FEATURE_FLAGS_LEGACY) {
+            return new ShortTextFeatureExtractor(numBuckets, false);
         }
         throw new IllegalStateException(String.format(
                 Locale.ROOT,
                 "No production FeatureExtractor for featureFlags=0x%03x. "
-                + "Known: ScriptAware=0x%03x, ShortText=0x%03x. "
+                + "Known: ScriptAware=0x%03x, ScriptAwareLegacy=0x%03x, "
+                + "ShortText=0x%03x, ShortTextLegacy=0x%03x. "
                 + "Use ResearchFeatureExtractor (test scope) for experimental 
configs.",
                 featureFlags,
                 ScriptAwareFeatureExtractor.FEATURE_FLAGS,
-                ShortTextFeatureExtractor.FEATURE_FLAGS));
+                ScriptAwareFeatureExtractor.FEATURE_FLAGS_LEGACY,
+                ShortTextFeatureExtractor.FEATURE_FLAGS,
+                ShortTextFeatureExtractor.FEATURE_FLAGS_LEGACY));
     }
 
     public int getFeatureFlags() {
diff --git 
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java
 
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java
index ff228debaa..d8640e30b1 100644
--- 
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java
+++ 
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java
@@ -50,16 +50,17 @@ import java.util.Map;
  * <p>Log-probabilities are quantized to unsigned INT8 over the range
  * [{@link #LOGP_MIN}, 0] and stored in dense byte arrays.
  *
- * <h3>Binary format ({@code GLM1} v2)</h3>
+ * <h3>Binary format ({@code GLM1} v3)</h3>
  * <pre>
  *   INT  magic    = 0x474C4D31
- *   INT  version  = 2
+ *   INT  version  = 3
  *   INT  numLangs
  *   INT  cjkUnigramBuckets
  *   INT  cjkBigramBuckets
  *   INT  noncjkUnigramBuckets
  *   INT  noncjkBigramBuckets
  *   INT  noncjkTrigramBuckets
+ *   INT  scriptCategories          (v3+)
  *   For each language:
  *     SHORT  codeLen
  *     BYTES  langCode (UTF-8)
@@ -69,6 +70,7 @@ import java.util.Map;
  *     BYTES  unigramTable  [cjkUnigramBuckets | noncjkUnigramBuckets]
  *     BYTES  bigramTable   [cjkBigramBuckets  | noncjkBigramBuckets]
  *     BYTES  trigramTable  [noncjkTrigramBuckets] (absent for CJK)
+ *     BYTES  scriptTable   [scriptCategories]  (v3+)
  * </pre>
  */
 public class GenerativeLanguageModel {
@@ -81,6 +83,13 @@ public class GenerativeLanguageModel {
     public static final int NONCJK_BIGRAM_BUCKETS  =  8_192;
     public static final int NONCJK_TRIGRAM_BUCKETS = 16_384;
 
+    /**
+     * Number of script categories tracked for script-block features.
+     * Matches {@link ScriptCategory#COUNT} at model-build time. The count is
+     * written into the v3 header so that a reader sizes each per-language script
+     * table from the file rather than from its own {@link ScriptCategory#COUNT}.
+     * Readers only accept versions up to their own {@code VERSION}, so a
+     * v2-only reader rejects v3 files rather than skipping this field.
+     */
+    public static final int SCRIPT_CATEGORIES = ScriptCategory.COUNT;
+
     /** Default classpath resource path for the bundled generative model. */
     public static final String DEFAULT_MODEL_RESOURCE =
             
"/org/apache/tika/langdetect/charsoup/langdetect-generative-v1-20260310.bin";
@@ -92,7 +101,7 @@ public class GenerativeLanguageModel {
     public static final float LOGP_MIN = -18.0f;
 
     private static final int MAGIC   = 0x474C4D31; // "GLM1"
-    private static final int VERSION = 2;
+    private static final int VERSION = 3;
 
     // ---- FNV-1a basis constants ----
 
@@ -124,6 +133,7 @@ public class GenerativeLanguageModel {
     private final byte[][]   unigramTables;   // [langIdx][bucket]
     private final byte[][]   bigramTables;    // [langIdx][bucket]
     private final byte[][]   trigramTables;   // [langIdx][bucket]; null entry 
for CJK langs
+    private final byte[][]   scriptTables;    // [langIdx][SCRIPT_CATEGORIES]; 
null if v2 model
     private final float[]    scoreMeans;      // μ per language (from training 
data)
     private final float[]    scoreStdDevs;    // σ per language (from training 
data)
 
@@ -133,6 +143,7 @@ public class GenerativeLanguageModel {
             byte[][]     unigramTables,
             byte[][]     bigramTables,
             byte[][]     trigramTables,
+            byte[][]     scriptTables,
             float[]      scoreMeans,
             float[]      scoreStdDevs) {
         this.langIds       = Collections.unmodifiableList(new 
ArrayList<>(langIds));
@@ -140,6 +151,7 @@ public class GenerativeLanguageModel {
         this.unigramTables = unigramTables;
         this.bigramTables  = bigramTables;
         this.trigramTables = trigramTables;
+        this.scriptTables  = scriptTables;
         this.scoreMeans    = scoreMeans;
         this.scoreStdDevs  = scoreStdDevs;
         Map<String, Integer> idx = new HashMap<>(langIds.size() * 2);
@@ -213,9 +225,54 @@ public class GenerativeLanguageModel {
                 });
         }
 
+        if (scriptTables != null && scriptTables[li] != null) {
+            float scriptScore = scoreScriptDistribution(preprocessed, 
scriptTables[li]);
+            if (!Float.isNaN(scriptScore)) {
+                sum[0] += scriptScore;
+                cnt[0]++;
+            }
+        }
+
         return cnt[0] == 0 ? Float.NaN : (float) (sum[0] / cnt[0]);
     }
 
+    /**
+     * Compute the proportion-weighted average of per-script log-probabilities
+     * for the letters in {@code preprocessed}.
+     *
+     * <p>Each letter is lower-cased and mapped to a script category; the result is
+     * the sum over categories of {@code (count / totalLetters) * dequantize(entry)}.
+     * The number of categories is taken from {@code scriptTable} itself, because a
+     * table read from a model file is sized by that file's {@code scriptCategories}
+     * header field, which need not equal {@link #SCRIPT_CATEGORIES}. Letters whose
+     * category has no entry in the table are ignored.
+     *
+     * @param preprocessed text to score
+     * @param scriptTable  quantized log-probabilities indexed by script category
+     * @return the weighted average, or NaN if no letter maps to a table entry
+     */
+    static float scoreScriptDistribution(String preprocessed, byte[] scriptTable) {
+        // Bound by the table length, not the compile-time constant, so a table
+        // with fewer categories can never be indexed out of range.
+        int numCategories = scriptTable.length;
+        int[] scriptCounts = new int[numCategories];
+        int totalLetters = 0;
+
+        int i = 0;
+        int len = preprocessed.length();
+        while (i < len) {
+            int cp = preprocessed.codePointAt(i);
+            i += Character.charCount(cp);
+            if (Character.isLetter(cp)) {
+                int lower = Character.toLowerCase(cp);
+                int script = ScriptCategory.of(lower);
+                if (script < numCategories) {
+                    scriptCounts[script]++;
+                    totalLetters++;
+                }
+            }
+        }
+
+        if (totalLetters == 0) {
+            return Float.NaN;
+        }
+
+        double weightedSum = 0.0;
+        for (int s = 0; s < numCategories; s++) {
+            if (scriptCounts[s] > 0) {
+                double proportion = (double) scriptCounts[s] / totalLetters;
+                weightedSum += proportion * dequantize(scriptTable[s]);
+            }
+        }
+        return (float) weightedSum;
+    }
+
     /**
      * Score {@code text} against all languages and return the best match.
      *
@@ -538,10 +595,11 @@ public class GenerativeLanguageModel {
             throw new IOException("Not a GLM1 file (bad magic)");
         }
         int version = din.readInt();
-        if (version != 1 && version != VERSION) {
+        if (version < 1 || version > VERSION) {
             throw new IOException("Unsupported GLM version: " + version);
         }
-        boolean hasStats = version >= 2;
+        boolean hasStats  = version >= 2;
+        boolean hasScript = version >= 3;
 
         int numLangs        = din.readInt();
         int cjkUni          = din.readInt();
@@ -550,11 +608,14 @@ public class GenerativeLanguageModel {
         int noncjkBi        = din.readInt();
         int noncjkTri       = din.readInt();
 
+        int scriptCats = hasScript ? din.readInt() : 0;
+
         List<String> langIds      = new ArrayList<>(numLangs);
         boolean[]    isCjk        = new boolean[numLangs];
         byte[][]     unigramTables = new byte[numLangs][];
         byte[][]     bigramTables  = new byte[numLangs][];
         byte[][]     trigramTables = new byte[numLangs][];
+        byte[][]     scriptTbls    = hasScript ? new byte[numLangs][] : null;
         float[]      means        = new float[numLangs];
         float[]      stdDevs      = new float[numLangs];
 
@@ -584,10 +645,15 @@ public class GenerativeLanguageModel {
                 trigramTables[i] = new byte[noncjkTri];
                 din.readFully(trigramTables[i]);
             }
+
+            if (hasScript) {
+                scriptTbls[i] = new byte[scriptCats];
+                din.readFully(scriptTbls[i]);
+            }
         }
 
         return new GenerativeLanguageModel(langIds, isCjk,
-                unigramTables, bigramTables, trigramTables,
+                unigramTables, bigramTables, trigramTables, scriptTbls,
                 means, stdDevs);
     }
 
@@ -605,6 +671,7 @@ public class GenerativeLanguageModel {
         dout.writeInt(NONCJK_UNIGRAM_BUCKETS);
         dout.writeInt(NONCJK_BIGRAM_BUCKETS);
         dout.writeInt(NONCJK_TRIGRAM_BUCKETS);
+        dout.writeInt(SCRIPT_CATEGORIES);
 
         for (int i = 0; i < langIds.size(); i++) {
             byte[] codeBytes = langIds.get(i).getBytes(StandardCharsets.UTF_8);
@@ -618,6 +685,11 @@ public class GenerativeLanguageModel {
             if (!isCjk[i]) {
                 dout.write(trigramTables[i]);
             }
+            if (scriptTables != null && scriptTables[i] != null) {
+                dout.write(scriptTables[i]);
+            } else {
+                dout.write(new byte[SCRIPT_CATEGORIES]);
+            }
         }
         dout.flush();
     }
@@ -638,6 +710,7 @@ public class GenerativeLanguageModel {
         private final Map<String, long[]>  unigramCounts = new HashMap<>();
         private final Map<String, long[]>  bigramCounts  = new HashMap<>();
         private final Map<String, long[]>  trigramCounts = new HashMap<>();
+        private final Map<String, long[]>  scriptCounts  = new HashMap<>();
 
         /**
          * Register a language before feeding it samples.  Must be called
@@ -652,6 +725,7 @@ public class GenerativeLanguageModel {
             if (!isCjk) {
                 trigramCounts.put(langCode, new long[NONCJK_TRIGRAM_BUCKETS]);
             }
+            scriptCounts.put(langCode, new long[SCRIPT_CATEGORIES]);
             return this;
         }
 
@@ -683,6 +757,8 @@ public class GenerativeLanguageModel {
                         h -> bg[h % NONCJK_BIGRAM_BUCKETS]++,
                         h -> tg[h % NONCJK_TRIGRAM_BUCKETS]++);
             }
+
+            accumulateScriptCounts(pp, scriptCounts.get(langCode));
             return this;
         }
 
@@ -695,21 +771,24 @@ public class GenerativeLanguageModel {
             List<String> ids  = new ArrayList<>(cjkFlags.keySet());
             int n = ids.size();
 
-            boolean[] cjkArr    = new boolean[n];
-            byte[][]  uniTables = new byte[n][];
-            byte[][]  biTables  = new byte[n][];
-            byte[][]  triTables = new byte[n][];
+            boolean[] cjkArr      = new boolean[n];
+            byte[][]  uniTables   = new byte[n][];
+            byte[][]  biTables    = new byte[n][];
+            byte[][]  triTables   = new byte[n][];
+            byte[][]  scriptTbls  = new byte[n][];
 
             for (int i = 0; i < n; i++) {
                 String lang = ids.get(i);
-                cjkArr[i]  = cjkFlags.get(lang);
-                uniTables[i] = toLogProbTable(unigramCounts.get(lang), addK);
-                biTables[i]  = toLogProbTable(bigramCounts.get(lang),  addK);
+                cjkArr[i]     = cjkFlags.get(lang);
+                uniTables[i]  = toLogProbTable(unigramCounts.get(lang), addK);
+                biTables[i]   = toLogProbTable(bigramCounts.get(lang),  addK);
                 if (!cjkArr[i]) {
                     triTables[i] = toLogProbTable(trigramCounts.get(lang), 
addK);
                 }
+                scriptTbls[i] = toLogProbTable(scriptCounts.get(lang), addK);
             }
-            return new GenerativeLanguageModel(ids, cjkArr, uniTables, 
biTables, triTables,
+            return new GenerativeLanguageModel(ids, cjkArr,
+                    uniTables, biTables, triTables, scriptTbls,
                     new float[n], new float[n]);
         }
 
@@ -726,5 +805,21 @@ public class GenerativeLanguageModel {
             }
             return table;
         }
+
+        /**
+         * Tally the script category of every letter in {@code preprocessed}
+         * into {@code dest}, indexed by category. Each letter is lower-cased
+         * before it is categorized; categories at or beyond
+         * {@code dest.length} are not counted.
+         */
+        private static void accumulateScriptCounts(String preprocessed, long[] dest) {
+            final int end = preprocessed.length();
+            for (int pos = 0; pos < end; ) {
+                final int codePoint = preprocessed.codePointAt(pos);
+                // Advance by one or two chars depending on the codepoint width.
+                pos += Character.charCount(codePoint);
+                if (!Character.isLetter(codePoint)) {
+                    continue;
+                }
+                final int category = ScriptCategory.of(Character.toLowerCase(codePoint));
+                if (category < dest.length) {
+                    dest[category]++;
+                }
+            }
+        }
     }
 }
diff --git 
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java
 
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java
index c97a6bdf6a..32dd1918cc 100644
--- 
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java
+++ 
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java
@@ -45,31 +45,48 @@ public class ScriptAwareFeatureExtractor implements 
FeatureExtractor {
      * model file always reflects the real inference-time feature set.
      */
     public static final int FEATURE_FLAGS =
+            CharSoupModel.FLAG_TRIGRAMS
+            | CharSoupModel.FLAG_SUFFIXES
+            | CharSoupModel.FLAG_PREFIX
+            | CharSoupModel.FLAG_WORD_UNIGRAMS
+            | CharSoupModel.FLAG_CHAR_UNIGRAMS
+            | CharSoupModel.FLAG_SCRIPT_BLOCKS;
+
+    /** Flags used by models trained before script block features were added. 
*/
+    public static final int FEATURE_FLAGS_LEGACY =
             CharSoupModel.FLAG_TRIGRAMS
             | CharSoupModel.FLAG_SUFFIXES
             | CharSoupModel.FLAG_PREFIX
             | CharSoupModel.FLAG_WORD_UNIGRAMS
             | CharSoupModel.FLAG_CHAR_UNIGRAMS;
 
-    static final int BIGRAM_BASIS  = 0x811c9dc5;
-    static final int TRIGRAM_BASIS = 0x9f4e3c21;
-    static final int UNIGRAM_BASIS = 0x2f4a3c17;
-    static final int WORD_BASIS    = 0x4a1c7b39;
-    static final int SUFFIX_BASIS  = 0x7e2b1a8f;
-    static final int PREFIX_BASIS  = 0x3b7e9f12;
+    static final int BIGRAM_BASIS       = 0x811c9dc5;
+    static final int TRIGRAM_BASIS      = 0x9f4e3c21;
+    static final int UNIGRAM_BASIS      = 0x2f4a3c17;
+    static final int WORD_BASIS         = 0x4a1c7b39;
+    static final int SUFFIX_BASIS       = 0x7e2b1a8f;
+    static final int PREFIX_BASIS       = 0x3b7e9f12;
+    public static final int SCRIPT_BASIS       = 0x5d8c2e71;
+    public static final int SCRIPT_TRANS_BASIS = 0x6f1a4b93;
 
     static final int MAX_WORD_LENGTH = 30;
     static final int MIN_WORD_LENGTH = 2;
     static final int SENTINEL = '_';
 
     private final int numBuckets;
+    private final boolean useScriptBlocks;
 
     public ScriptAwareFeatureExtractor(int numBuckets) {
+        this(numBuckets, true);
+    }
+
+    public ScriptAwareFeatureExtractor(int numBuckets, boolean 
useScriptBlocks) {
         if (numBuckets <= 0) {
             throw new IllegalArgumentException(
                     "numBuckets must be positive: " + numBuckets);
         }
         this.numBuckets = numBuckets;
+        this.useScriptBlocks = useScriptBlocks;
     }
 
     @Override
@@ -135,6 +152,11 @@ public class ScriptAwareFeatureExtractor implements 
FeatureExtractor {
         int suf0 = SENTINEL, suf1 = SENTINEL, suf2 = SENTINEL, suf3 = SENTINEL;
         int preA = SENTINEL, preB = SENTINEL, preC = SENTINEL;
 
+        int[] scriptCounts = useScriptBlocks ? new int[ScriptCategory.COUNT] : 
null;
+        int[] transitionCounts = useScriptBlocks
+                ? new int[ScriptCategory.COUNT * ScriptCategory.COUNT] : null;
+        int lastLetterScript = -1;
+
         int i = 0;
         int len = text.length();
         while (i < len) {
@@ -150,6 +172,14 @@ public class ScriptAwareFeatureExtractor implements 
FeatureExtractor {
                 int script = ScriptCategory.of(lower);
                 boolean cjk = isCjkScript(script);
 
+                if (useScriptBlocks) {
+                    scriptCounts[script]++;
+                    if (lastLetterScript >= 0 && lastLetterScript != script) {
+                        transitionCounts[lastLetterScript * 
ScriptCategory.COUNT + script]++;
+                    }
+                    lastLetterScript = script;
+                }
+
                 if (prevWasLetter) {
                     if (!sameFamily(script, prevScript)) {
                         emitBoundaryEnd(counts, prevScript, prevCp, prevWasCjk,
@@ -249,6 +279,10 @@ public class ScriptAwareFeatureExtractor implements 
FeatureExtractor {
                     wordHash, wordLen, wordScript,
                     suf0, suf1, suf2, suf3, preA, preB, preC);
         }
+
+        if (useScriptBlocks) {
+            emitScriptFeatures(counts, scriptCounts, transitionCounts);
+        }
     }
 
     private void emitBoundaryStart(int[] counts, int script, int lower, 
boolean cjk) {
@@ -310,10 +344,61 @@ public class ScriptAwareFeatureExtractor implements 
FeatureExtractor {
         }
     }
 
+    public static final int SCRIPT_SCALE = 100;
+
+    /**
+     * Fold script-presence and script-transition features into {@code counts}.
+     *
+     * <p>Each script that occurs contributes
+     * {@code round(SCRIPT_SCALE * count / totalLetters)} to its hashed bucket;
+     * each (from, to) transition contributes
+     * {@code round(SCRIPT_SCALE * count / totalTransitions)}. Weights that round
+     * to zero are not emitted. Nothing is emitted when there are no letters, and
+     * no transition features are emitted when there are no transitions.
+     *
+     * @param counts           feature bucket array to update
+     * @param scriptCounts     letters seen per script category
+     * @param transitionCounts flattened [from * COUNT + to] transition tallies
+     */
+    private void emitScriptFeatures(int[] counts,
+                                     int[] scriptCounts,
+                                     int[] transitionCounts) {
+        int letterTotal = 0;
+        for (int idx = 0; idx < scriptCounts.length; idx++) {
+            letterTotal += scriptCounts[idx];
+        }
+        if (letterTotal == 0) {
+            return;
+        }
+
+        for (int script = 0; script < ScriptCategory.COUNT; script++) {
+            int seen = scriptCounts[script];
+            if (seen <= 0) {
+                continue;
+            }
+            int share = (int) Math.round((double) SCRIPT_SCALE * seen / letterTotal);
+            if (share > 0) {
+                int hash = fnvFeedByte(SCRIPT_BASIS, script);
+                counts[(hash & 0x7FFFFFFF) % numBuckets] += share;
+            }
+        }
+
+        int transitionTotal = 0;
+        for (int idx = 0; idx < transitionCounts.length; idx++) {
+            transitionTotal += transitionCounts[idx];
+        }
+        if (transitionTotal == 0) {
+            return;
+        }
+
+        // Walk the flattened matrix in row-major order: idx = from * COUNT + to.
+        int cells = ScriptCategory.COUNT * ScriptCategory.COUNT;
+        for (int idx = 0; idx < cells; idx++) {
+            int seen = transitionCounts[idx];
+            if (seen <= 0) {
+                continue;
+            }
+            int share = (int) Math.round((double) SCRIPT_SCALE * seen / transitionTotal);
+            if (share > 0) {
+                int from = idx / ScriptCategory.COUNT;
+                int to = idx % ScriptCategory.COUNT;
+                int hash = fnvFeedByte(fnvFeedByte(SCRIPT_TRANS_BASIS, from), to);
+                counts[(hash & 0x7FFFFFFF) % numBuckets] += share;
+            }
+        }
+    }
+
     // ---- Script helpers ----
 
-    private static boolean isCjkScript(int script) {
+    public static boolean isCjkScript(int script) {
         return script == ScriptCategory.HAN
+                || script == ScriptCategory.HAN_EXT_A
+                || script == ScriptCategory.HAN_EXT_B
+                || script == ScriptCategory.HAN_COMPAT
+                || script == ScriptCategory.BOPOMOFO
                 || script == ScriptCategory.HIRAGANA
                 || script == ScriptCategory.KATAKANA;
     }
@@ -357,6 +442,6 @@ public class ScriptAwareFeatureExtractor implements 
FeatureExtractor {
 
     @Override
     public int getFeatureFlags() {
-        return FEATURE_FLAGS;
+        return useScriptBlocks ? FEATURE_FLAGS : FEATURE_FLAGS_LEGACY;
     }
 }
diff --git 
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java
 
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java
index cbd19d824f..37370e9e6e 100644
--- 
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java
+++ 
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java
@@ -56,14 +56,22 @@ public final class ScriptCategory {
     public static final int TIBETAN = 18;
     public static final int KHMER = 19;
 
+    // CJK sub-blocks — finer-grained Han categories for distinguishing
+    // Simplified Chinese, Traditional Chinese, Japanese kanji, and Korean 
hanja.
+    public static final int HAN_EXT_A  = 20;  // U+3400–U+4DBF
+    public static final int HAN_EXT_B  = 21;  // U+20000–U+2A6DF (and later 
extensions)
+    public static final int HAN_COMPAT = 22;  // U+F900–U+FAFF
+    public static final int BOPOMOFO   = 23;  // U+3100–U+312F, U+31A0–U+31BF
+
     /** Number of distinct categories. */
-    public static final int COUNT = 20;
+    public static final int COUNT = 24;
 
     private static final String[] NAMES = {
             "LATIN", "CYRILLIC", "ARABIC", "HAN", "HANGUL",
             "HIRAGANA", "KATAKANA", "DEVANAGARI", "THAI", "GREEK",
             "HEBREW", "BENGALI", "GEORGIAN", "ARMENIAN", "ETHIOPIC", "OTHER",
-            "CANADIAN_ABORIGINAL", "MYANMAR", "TIBETAN", "KHMER"
+            "CANADIAN_ABORIGINAL", "MYANMAR", "TIBETAN", "KHMER",
+            "HAN_EXT_A", "HAN_EXT_B", "HAN_COMPAT", "BOPOMOFO"
     };
 
     private ScriptCategory() {
@@ -84,10 +92,42 @@ public final class ScriptCategory {
         if (cp < 0x0080) {
             return LATIN;
         }
+
+        // Bopomofo (Traditional Chinese phonetic) — check before UnicodeScript
+        // because Java maps these to BOPOMOFO script, not HAN.
+        if ((cp >= 0x3100 && cp <= 0x312F) || (cp >= 0x31A0 && cp <= 0x31BF)) {
+            return BOPOMOFO;
+        }
+
         Character.UnicodeScript us = Character.UnicodeScript.of(cp);
+
+        // Sub-block routing for HAN codepoints
+        if (us == Character.UnicodeScript.HAN) {
+            return hanSubBlock(cp);
+        }
+
         return fromUnicodeScript(us);
     }
 
+    /**
+     * Pick the category for a codepoint whose Unicode script is HAN.
+     * Extension A (U+3400–U+4DBF), the compatibility ideographs
+     * (U+F900–U+FAFF) and every codepoint at U+20000 or above each get a
+     * dedicated category for finer-grained language discrimination; any other
+     * HAN codepoint, including the common CJK Unified Ideographs block
+     * (U+4E00–U+9FFF), maps to {@link #HAN}.
+     */
+    static int hanSubBlock(int cp) {
+        if (cp >= 0x20000) {
+            return HAN_EXT_B;
+        }
+        boolean inExtensionA = cp >= 0x3400 && cp <= 0x4DBF;
+        if (inExtensionA) {
+            return HAN_EXT_A;
+        }
+        boolean inCompatibility = cp >= 0xF900 && cp <= 0xFAFF;
+        return inCompatibility ? HAN_COMPAT : HAN;
+    }
+
     /**
      * Map a {@link Character.UnicodeScript} to a category.
      */
diff --git 
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ShortTextFeatureExtractor.java
 
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ShortTextFeatureExtractor.java
index 912461d54c..234d97e2b9 100644
--- 
a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ShortTextFeatureExtractor.java
+++ 
b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ShortTextFeatureExtractor.java
@@ -42,28 +42,42 @@ public class ShortTextFeatureExtractor implements 
FeatureExtractor {
      * describes the features this extractor emits.
      */
     public static final int FEATURE_FLAGS =
+            CharSoupModel.FLAG_TRIGRAMS
+            | CharSoupModel.FLAG_WORD_UNIGRAMS
+            | CharSoupModel.FLAG_4GRAMS
+            | CharSoupModel.FLAG_SCRIPT_BLOCKS;
+
+    public static final int FEATURE_FLAGS_LEGACY =
             CharSoupModel.FLAG_TRIGRAMS
             | CharSoupModel.FLAG_WORD_UNIGRAMS
             | CharSoupModel.FLAG_4GRAMS;
 
-    static final int BIGRAM_BASIS   = 0x811c9dc5;
-    static final int TRIGRAM_BASIS  = 0x9f4e3c21;
-    static final int FOURGRAM_BASIS = 0xa3d8f215;
-    static final int UNIGRAM_BASIS  = 0x2f4a3c17;
-    static final int WORD_BASIS     = 0x4a1c7b39;
+    static final int BIGRAM_BASIS       = 0x811c9dc5;
+    static final int TRIGRAM_BASIS      = 0x9f4e3c21;
+    static final int FOURGRAM_BASIS     = 0xa3d8f215;
+    static final int UNIGRAM_BASIS      = 0x2f4a3c17;
+    static final int WORD_BASIS         = 0x4a1c7b39;
+    static final int SCRIPT_BASIS       = 
ScriptAwareFeatureExtractor.SCRIPT_BASIS;
+    static final int SCRIPT_TRANS_BASIS = 
ScriptAwareFeatureExtractor.SCRIPT_TRANS_BASIS;
 
     static final int MAX_WORD_LENGTH = 30;
     static final int MIN_WORD_LENGTH = 2;
     static final int SENTINEL = '_';
 
     private final int numBuckets;
+    private final boolean useScriptBlocks;
 
     public ShortTextFeatureExtractor(int numBuckets) {
+        this(numBuckets, true);
+    }
+
+    public ShortTextFeatureExtractor(int numBuckets, boolean useScriptBlocks) {
         if (numBuckets <= 0) {
             throw new IllegalArgumentException(
                     "numBuckets must be positive: " + numBuckets);
         }
         this.numBuckets = numBuckets;
+        this.useScriptBlocks = useScriptBlocks;
     }
 
     @Override
@@ -127,14 +141,17 @@ public class ShortTextFeatureExtractor implements 
FeatureExtractor {
         int wordHash = WORD_BASIS;
         int wordLen = 0;
         int wordScript = -1;
-        // rolling suffix window for 4-gram boundary emissions
         int suf1 = SENTINEL;
         int suf2 = SENTINEL;
         int suf3 = SENTINEL;
-        // prefix window for boundary 4-gram at word start
         int preA = SENTINEL;
         int preB = SENTINEL;
 
+        int[] scriptCounts = useScriptBlocks ? new int[ScriptCategory.COUNT] : 
null;
+        int[] transitionCounts = useScriptBlocks
+                ? new int[ScriptCategory.COUNT * ScriptCategory.COUNT] : null;
+        int lastLetterScript = -1;
+
         int i = 0;
         int len = text.length();
         while (i < len) {
@@ -150,6 +167,14 @@ public class ShortTextFeatureExtractor implements 
FeatureExtractor {
                 int script = ScriptCategory.of(lower);
                 boolean cjk = isCjkScript(script);
 
+                if (useScriptBlocks) {
+                    scriptCounts[script]++;
+                    if (lastLetterScript >= 0 && lastLetterScript != script) {
+                        transitionCounts[lastLetterScript * 
ScriptCategory.COUNT + script]++;
+                    }
+                    lastLetterScript = script;
+                }
+
                 if (prevWasLetter) {
                     if (!sameFamily(script, prevScript)) {
                         emitBoundaryEnd(counts, prevScript, prevCp, prevWasCjk,
@@ -247,6 +272,10 @@ public class ShortTextFeatureExtractor implements 
FeatureExtractor {
             emitBoundaryEnd(counts, prevScript, prevCp, prevWasCjk,
                     wordHash, wordLen, wordScript, suf1, suf2, suf3);
         }
+
+        if (useScriptBlocks) {
+            emitScriptFeatures(counts, scriptCounts, transitionCounts);
+        }
     }
 
     private void emitBoundaryStart(int[] counts, int script, int lower, 
boolean cjk) {
@@ -302,10 +331,55 @@ public class ShortTextFeatureExtractor implements 
FeatureExtractor {
         counts[(h & 0x7FFFFFFF) % numBuckets]++;
     }
 
+    static final int SCRIPT_SCALE = ScriptAwareFeatureExtractor.SCRIPT_SCALE;
+
+    /**
+     * Add normalized script-presence and script-transition weights to
+     * {@code counts}.
+     *
+     * <p>A script's weight is its share of all letters, and a transition's
+     * weight is its share of all transitions, each scaled by
+     * {@code SCRIPT_SCALE} and rounded. Weights that round to zero are skipped.
+     * The method returns without emitting anything when no letters were
+     * counted, and skips the transition features when no transitions were.
+     *
+     * @param counts           feature bucket array to update
+     * @param scriptCounts     letters seen per script category
+     * @param transitionCounts flattened [from * COUNT + to] transition tallies
+     */
+    private void emitScriptFeatures(int[] counts,
+                                     int[] scriptCounts,
+                                     int[] transitionCounts) {
+        final int width = ScriptCategory.COUNT;
+
+        int letters = 0;
+        for (int perScript : scriptCounts) {
+            letters += perScript;
+        }
+        if (letters == 0) {
+            return;
+        }
+
+        int script = 0;
+        while (script < width) {
+            final int occurrences = scriptCounts[script];
+            if (occurrences > 0) {
+                final long rounded = Math.round((double) SCRIPT_SCALE * occurrences / letters);
+                final int weight = (int) rounded;
+                if (weight > 0) {
+                    final int hash = fnvFeedByte(SCRIPT_BASIS, script);
+                    counts[(hash & 0x7FFFFFFF) % numBuckets] += weight;
+                }
+            }
+            script++;
+        }
+
+        int transitions = 0;
+        for (int perPair : transitionCounts) {
+            transitions += perPair;
+        }
+        if (transitions == 0) {
+            return;
+        }
+
+        for (int from = 0; from < width; from++) {
+            final int rowStart = from * width;
+            for (int to = 0; to < width; to++) {
+                final int occurrences = transitionCounts[rowStart + to];
+                if (occurrences <= 0) {
+                    continue;
+                }
+                final long rounded =
+                        Math.round((double) SCRIPT_SCALE * occurrences / transitions);
+                final int weight = (int) rounded;
+                if (weight > 0) {
+                    final int hash = fnvFeedByte(fnvFeedByte(SCRIPT_TRANS_BASIS, from), to);
+                    counts[(hash & 0x7FFFFFFF) % numBuckets] += weight;
+                }
+            }
+        }
+    }
+
     private static boolean isCjkScript(int script) {
-        return script == ScriptCategory.HAN
-                || script == ScriptCategory.HIRAGANA
-                || script == ScriptCategory.KATAKANA;
+        return ScriptAwareFeatureExtractor.isCjkScript(script);
     }
 
     private static boolean sameFamily(int a, int b) {
@@ -343,6 +417,6 @@ public class ShortTextFeatureExtractor implements 
FeatureExtractor {
 
     @Override
     public int getFeatureFlags() {
-        return FEATURE_FLAGS;
+        return useScriptBlocks ? FEATURE_FLAGS : FEATURE_FLAGS_LEGACY;
     }
 }
diff --git 
a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
 
b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
index 99962e60ab..5bd32d7f8a 100644
--- 
a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
+++ 
b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
@@ -210,7 +210,7 @@ public class CharSoupLanguageDetector extends 
LanguageDetector implements SelfCo
         FeatureExtractor shortExtractor = null;
         try {
             shortModel = 
CharSoupModel.loadFromClasspath(SHORT_TEXT_MODEL_RESOURCE);
-            shortExtractor = new 
ShortTextFeatureExtractor(shortModel.getNumBuckets());
+            shortExtractor = shortModel.createExtractor();
             verifyFlagsMatch(shortModel, shortExtractor, 
SHORT_TEXT_MODEL_RESOURCE);
             SHORT_TEXT_GROUP_INDICES = buildGroupIndices(shortModel);
             SHORT_TEXT_CLASS_SCRIPT = buildClassScript(shortModel);
diff --git 
a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractorTest.java
 
b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractorTest.java
index 7828a59eef..fe1164d756 100644
--- 
a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractorTest.java
+++ 
b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractorTest.java
@@ -29,12 +29,15 @@ public class ScriptAwareFeatureExtractorTest {
 
     private static final int NUM_BUCKETS = 8192;
 
+    /**
+     * Builds an extractor over {@link #NUM_BUCKETS} buckets with the second
+     * constructor argument set to {@code false}. The tests in this class use
+     * it as the configuration without script-block features, so their
+     * expected sums count n-gram features only.
+     */
+    private static ScriptAwareFeatureExtractor ngramOnly() {
+        return new ScriptAwareFeatureExtractor(NUM_BUCKETS, false);
+    }
+
     // ---- Basic sanity ----
 
     @Test
     public void testEmptyAndNull() {
-        ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+        ScriptAwareFeatureExtractor ext = ngramOnly();
         int[] counts = ext.extract(null);
         assertEquals(NUM_BUCKETS, counts.length);
         assertEquals(0, sum(counts));
@@ -45,8 +48,7 @@ public class ScriptAwareFeatureExtractorTest {
 
     @Test
     public void testSingleWord() {
-        ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+        ScriptAwareFeatureExtractor ext = ngramOnly();
         int[] counts = ext.extract("hello");
         // "hello" (production config: bigrams + trigrams + suffix + prefix + 
word):
         // bigrams:  (_,h) (h,e) (e,l) (l,l) (l,o) (o,_) = 6
@@ -61,7 +63,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testCjkUnigrams() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         // "中文": no sentinels for CJK
         // bigrams: (中,文) = 1
         // unigrams: 中, 文 = 2
@@ -73,7 +75,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testHiraganaUnigrams() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         // "あい": no sentinels for kana
         // bigrams: (あ,い) = 1
         // unigrams: あ, い = 2
@@ -85,7 +87,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testKatakanaUnigrams() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         // "アイ": same as hiragana
         int[] counts = ext.extract("アイ");
         assertEquals(3, sum(counts));
@@ -96,7 +98,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testCjkSpaceBridging() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         // "中 文" with space should produce same features as "中文"
         // The space is bridged for CJK
         int[] withSpace = ext.extract("中 文");
@@ -110,7 +112,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testCjkPunctuationBreaks() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         // "中。文" — punctuation IS a real break
         int[] withPunct = ext.extract("中。文");
         int[] noSpace = ext.extract("中文");
@@ -131,7 +133,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testLatinAndCyrillicDontCollide() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         int[] latin = ext.extract("ab");
         int[] cyrillic = ext.extract("аб");
         assertNotEquals(0, sum(latin));
@@ -152,7 +154,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testJapaneseScriptFamilyNoBoundary() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         // "漢あア" — Han + Hiragana + Katakana
         // All are CJK family, so no boundary between them.
         // bigrams: (漢,あ) (あ,ア) = 2
@@ -165,7 +167,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testJapaneseVsLatinCreatesBoundary() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         // "漢a" — Han then Latin: different family → boundary
         // Han part: (漢) = 1 unigram (no sentinels for CJK)
         // Latin part: (_,a) (a,_) = 2 bigrams (sentinels)
@@ -177,7 +179,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testHanHiraganaBigramChain() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         // "食べる" — Han(食) Hiragana(べ) Hiragana(る)
         // bigrams: (食,べ) (べ,る) = 2
         // unigrams: 食, べ, る = 3
@@ -191,7 +193,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testScriptChangeCreatesBoundary() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         // "abаб" — Latin "ab" followed by Cyrillic "аб"
         int[] mixed = ext.extract("abаб");
 
@@ -213,7 +215,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testWordUnigrams() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         // "abc" (production config):
         // bigrams:  (_,a) (a,b) (b,c) (c,_) = 4
         // trigrams: (_,a,b) (a,b,c) (b,c,_) = 3
@@ -228,7 +230,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testSingleCharWordNoWordUnigram() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         // "a" — single char word: bigrams only, no trigram/suffix/prefix/word 
unigram
         // bigrams: (_,a) (a,_) = 2
         // total = 2
@@ -241,7 +243,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testArabicDiacriticsTransparent() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         int[] plain = ext.extract("كتب");
         int[] diacritics = ext.extract("كَتَبَ");
         for (int i = 0; i < NUM_BUCKETS; i++) {
@@ -255,7 +257,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testExtractFromPreprocessed() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         String raw = "Hello https://example.com world";
         String preprocessed =
                 CharSoupFeatureExtractor.preprocess(raw);
@@ -270,7 +272,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testExtractFromPreprocessedAccumulate() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         int[] counts = ext.extract("hello");
         int sum1 = sum(counts);
         String preprocessed =
@@ -378,7 +380,7 @@ public class ScriptAwareFeatureExtractorTest {
     @RepeatedTest(10)
     public void testRandomSurrogatePairsAndEdgeCases() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
 
         String[] pathological = {
                 new String(new char[]{0xD800, 0xD801, 0xD802}),
@@ -410,7 +412,7 @@ public class ScriptAwareFeatureExtractorTest {
     @Test
     public void testDeterministic() {
         ScriptAwareFeatureExtractor ext =
-                new ScriptAwareFeatureExtractor(NUM_BUCKETS);
+                ngramOnly();
         String text =
                 "The quick brown fox 快速的棕色狐狸 прыгнул через";
         int[] first = ext.extract(text);
@@ -421,6 +423,45 @@ public class ScriptAwareFeatureExtractorTest {
         }
     }
 
+    // ---- Script block features ----
+
+    /**
+     * Single-script input with script blocks enabled: the total feature mass
+     * for "hello" must be the 14 n-gram features plus a presence weight of 100.
+     */
+    @Test
+    public void testScriptBlocksAddWeight() {
+        ScriptAwareFeatureExtractor withBlocks =
+                new ScriptAwareFeatureExtractor(NUM_BUCKETS, true);
+        int[] counts = withBlocks.extract("hello");
+        // n-gram features = 14 (same as testSingleWord)
+        // + script presence: 100% LATIN → weight 100
+        // total = 114
+        assertEquals(114, sum(counts));
+    }
+
+    /**
+     * Mixed-script input: compares the script-block extractor with the
+     * n-gram-only extractor on the same text and expects the difference in
+     * total feature mass to be exactly 200.
+     */
+    @Test
+    public void testScriptBlocksMixedScript() {
+        ScriptAwareFeatureExtractor withBlocks =
+                new ScriptAwareFeatureExtractor(NUM_BUCKETS, true);
+        int[] withScript = withBlocks.extract("hello世界");
+        int[] noScript   = ngramOnly().extract("hello世界");
+        // Script features add presence weights (L1-normalized to 100)
+        // plus at least one transition (LATIN→HAN)
+        assertTrue(sum(withScript) > sum(noScript),
+                "Script block features should add weight");
+        int scriptContribution = sum(withScript) - sum(noScript);
+        // Presence: ~71% LATIN + ~29% HAN = 100 total presence
+        // Transition: 100% LATIN→HAN = 100 total transition
+        // Sum should be 200
+        assertEquals(200, scriptContribution);
+    }
+
+    /**
+     * Checks that the boolean constructor argument toggles the script
+     * features: with {@code false} the sum for "hello" stays at the 14
+     * n-gram features, and with {@code true} it is strictly larger.
+     */
+    @Test
+    public void testScriptBlocksDisabled() {
+        int[] withBlocks = new ScriptAwareFeatureExtractor(NUM_BUCKETS, true)
+                .extract("hello");
+        int[] withoutBlocks = ngramOnly().extract("hello");
+        assertTrue(sum(withBlocks) > sum(withoutBlocks));
+        assertEquals(14, sum(withoutBlocks));
+    }
+
     // ---- Helpers ----
 
     private int sum(int[] arr) {
diff --git 
a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ResearchFeatureExtractor.java
 
b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ResearchFeatureExtractor.java
index f7713abcb0..56676b1c3f 100644
--- 
a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ResearchFeatureExtractor.java
+++ 
b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ResearchFeatureExtractor.java
@@ -21,6 +21,7 @@ import java.util.Arrays;
 import org.apache.tika.langdetect.charsoup.CharSoupFeatureExtractor;
 import org.apache.tika.langdetect.charsoup.CharSoupModel;
 import org.apache.tika.langdetect.charsoup.FeatureExtractor;
+import org.apache.tika.langdetect.charsoup.ScriptAwareFeatureExtractor;
 import org.apache.tika.langdetect.charsoup.ScriptCategory;
 
 /**
@@ -47,10 +48,13 @@ public class ResearchFeatureExtractor implements 
FeatureExtractor {
     static final int CHAR_UNIGRAM_BASIS = 0x1d4f8c3a;
     static final int FOURGRAM_BASIS     = 0xa3d8f215;
     static final int FIVEGRAM_BASIS     = 0xc7b46e38;
+    static final int SCRIPT_BASIS       = 
ScriptAwareFeatureExtractor.SCRIPT_BASIS;
+    static final int SCRIPT_TRANS_BASIS = 
ScriptAwareFeatureExtractor.SCRIPT_TRANS_BASIS;
 
     static final int MAX_WORD_LENGTH = 30;
     static final int MIN_WORD_LENGTH = 2;
     static final int SENTINEL = '_';
+    static final int SCRIPT_SCALE = ScriptAwareFeatureExtractor.SCRIPT_SCALE;
 
     private final int numBuckets;
     private final boolean useTrigrams;
@@ -62,10 +66,11 @@ public class ResearchFeatureExtractor implements 
FeatureExtractor {
     private final boolean useCharUnigrams;
     private final boolean use4grams;
     private final boolean use5grams;
+    private final boolean useScriptBlocks;
 
     /** Minimal constructor: bigrams + word unigrams + CJK unigrams. */
     public ResearchFeatureExtractor(int numBuckets) {
-        this(numBuckets, false, false, false, false, false, true, false, 
false, false);
+        this(numBuckets, false, false, false, false, false, true, false, 
false, false, false);
     }
 
     /** Full-config constructor. All features share the same flat bucket 
space. */
@@ -79,6 +84,22 @@ public class ResearchFeatureExtractor implements 
FeatureExtractor {
                                     boolean useCharUnigrams,
                                     boolean use4grams,
                                     boolean use5grams) {
+        this(numBuckets, useTrigrams, useSkipBigrams, useSuffixes, useSuffix4,
+                usePrefix, useWordUnigrams, useCharUnigrams, use4grams, 
use5grams, false);
+    }
+
+    /** Full-config constructor including script block features. */
+    public ResearchFeatureExtractor(int numBuckets,
+                                    boolean useTrigrams,
+                                    boolean useSkipBigrams,
+                                    boolean useSuffixes,
+                                    boolean useSuffix4,
+                                    boolean usePrefix,
+                                    boolean useWordUnigrams,
+                                    boolean useCharUnigrams,
+                                    boolean use4grams,
+                                    boolean use5grams,
+                                    boolean useScriptBlocks) {
         if (numBuckets <= 0) {
             throw new IllegalArgumentException(
                     "numBuckets must be positive: " + numBuckets);
@@ -93,6 +114,7 @@ public class ResearchFeatureExtractor implements 
FeatureExtractor {
         this.useCharUnigrams = useCharUnigrams;
         this.use4grams = use4grams;
         this.use5grams = use5grams;
+        this.useScriptBlocks = useScriptBlocks;
     }
 
     @Override
@@ -155,6 +177,11 @@ public class ResearchFeatureExtractor implements 
FeatureExtractor {
         int preB = SENTINEL;
         int preC = SENTINEL;
 
+        int[] scriptCounts = useScriptBlocks ? new int[ScriptCategory.COUNT] : 
null;
+        int[] transitionCounts = useScriptBlocks
+                ? new int[ScriptCategory.COUNT * ScriptCategory.COUNT] : null;
+        int lastLetterScript = -1;
+
         int i = 0;
         int len = text.length();
         while (i < len) {
@@ -170,6 +197,14 @@ public class ResearchFeatureExtractor implements 
FeatureExtractor {
                 int script = ScriptCategory.of(lower);
                 boolean cjk = isCjkScript(script);
 
+                if (useScriptBlocks) {
+                    scriptCounts[script]++;
+                    if (lastLetterScript >= 0 && lastLetterScript != script) {
+                        transitionCounts[lastLetterScript * 
ScriptCategory.COUNT + script]++;
+                    }
+                    lastLetterScript = script;
+                }
+
                 if (prevWasLetter) {
                     if (!sameFamily(script, prevScript)) {
                         emitBoundaryEnd(counts, prevScript, prevCp, prevWasCjk,
@@ -299,6 +334,55 @@ public class ResearchFeatureExtractor implements 
FeatureExtractor {
                     wordHash, wordLen, wordScript,
                     suf0, suf1, suf2, suf3, preA, preB, preC);
         }
+
+        if (useScriptBlocks) {
+            emitScriptFeatures(counts, scriptCounts, transitionCounts);
+        }
+    }
+
+    /**
+     * Adds script-presence and script-transition features to {@code counts}.
+     *
+     * <p>The extraction loop increments {@code scriptCounts[script]} for each
+     * code point it classifies with {@code ScriptCategory.of}. It increments
+     * {@code transitionCounts[prev * ScriptCategory.COUNT + script]} whenever
+     * that script differs from the previously classified one.
+     *
+     * <p>Each non-zero count is converted to its share of its own array
+     * total, scaled by {@code SCRIPT_SCALE} and rounded to the nearest
+     * integer. Shares that round to zero are dropped. Presence weights are
+     * hashed with {@code SCRIPT_BASIS}; transition weights are hashed with
+     * {@code SCRIPT_TRANS_BASIS} fed the source and then the target script.
+     *
+     * <p>Because each weight is rounded independently, the emitted weights of
+     * one group are not guaranteed to sum to exactly {@code SCRIPT_SCALE}.
+     *
+     * @param counts           feature bucket array, updated in place
+     * @param scriptCounts     histogram indexed by script category
+     * @param transitionCounts flattened {@code COUNT x COUNT} matrix of
+     *                         script changes, indexed {@code [from * COUNT + to]}
+     */
+    private void emitScriptFeatures(int[] counts,
+                                     int[] scriptCounts,
+                                     int[] transitionCounts) {
+        int totalLetters = 0;
+        for (int c : scriptCounts) {
+            totalLetters += c;
+        }
+        // Empty histogram: nothing to normalize against, so emit nothing.
+        if (totalLetters == 0) {
+            return;
+        }
+
+        for (int s = 0; s < ScriptCategory.COUNT; s++) {
+            if (scriptCounts[s] > 0) {
+                int weight = (int) Math.round(
+                        (double) SCRIPT_SCALE * scriptCounts[s] / 
totalLetters);
+                if (weight > 0) {
+                    int h = fnvFeedByte(SCRIPT_BASIS, s);
+                    // Clear the sign bit so the modulo yields a non-negative index.
+                    counts[(h & 0x7FFFFFFF) % numBuckets] += weight;
+                }
+            }
+        }
+
+        int totalTransitions = 0;
+        for (int c : transitionCounts) {
+            totalTransitions += c;
+        }
+        // Single-script text records no transitions; presence features stand alone.
+        if (totalTransitions == 0) {
+            return;
+        }
+
+        for (int s = 0; s < ScriptCategory.COUNT; s++) {
+            for (int t = 0; t < ScriptCategory.COUNT; t++) {
+                int c = transitionCounts[s * ScriptCategory.COUNT + t];
+                if (c > 0) {
+                    int weight = (int) Math.round(
+                            (double) SCRIPT_SCALE * c / totalTransitions);
+                    if (weight > 0) {
+                        int h = fnvFeedByte(fnvFeedByte(SCRIPT_TRANS_BASIS, 
s), t);
+                        counts[(h & 0x7FFFFFFF) % numBuckets] += weight;
+                    }
+                }
+            }
+        }
     }
 
     private void emitBoundaryStart(int[] counts, int script, int lower, 
boolean cjk) {
@@ -402,9 +486,7 @@ public class ResearchFeatureExtractor implements 
FeatureExtractor {
     }
 
     private static boolean isCjkScript(int script) {
-        return script == ScriptCategory.HAN
-                || script == ScriptCategory.HIRAGANA
-                || script == ScriptCategory.KATAKANA;
+        return ScriptAwareFeatureExtractor.isCjkScript(script);
     }
 
     private static boolean sameFamily(int a, int b) {
@@ -452,6 +534,7 @@ public class ResearchFeatureExtractor implements 
FeatureExtractor {
         if (useCharUnigrams) flags |= CharSoupModel.FLAG_CHAR_UNIGRAMS;
         if (use4grams)       flags |= CharSoupModel.FLAG_4GRAMS;
         if (use5grams)       flags |= CharSoupModel.FLAG_5GRAMS;
+        if (useScriptBlocks) flags |= CharSoupModel.FLAG_SCRIPT_BLOCKS;
         return flags;
     }
 }

(tika) 01/03: enrich short text discriminative and generative features

Reply via email to