This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4692-script-block-features in repository https://gitbox.apache.org/repos/asf/tika.git
commit d1c6bf14f1c02f57652cbb8a546cb6ae903ba13d Author: tballison <[email protected]> AuthorDate: Fri Mar 13 23:33:44 2026 -0400 enrich short text discriminative and generative features --- .../tika/langdetect/charsoup/CharSoupModel.java | 19 +++- .../charsoup/GenerativeLanguageModel.java | 123 ++++++++++++++++++--- .../charsoup/ScriptAwareFeatureExtractor.java | 101 +++++++++++++++-- .../tika/langdetect/charsoup/ScriptCategory.java | 44 +++++++- .../charsoup/ShortTextFeatureExtractor.java | 96 ++++++++++++++-- .../charsoup/CharSoupLanguageDetector.java | 2 +- .../charsoup/ScriptAwareFeatureExtractorTest.java | 83 ++++++++++---- .../charsoup/tools/ResearchFeatureExtractor.java | 91 ++++++++++++++- 8 files changed, 494 insertions(+), 65 deletions(-) diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java index cbcf21512c..cef2b81b99 100644 --- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java +++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java @@ -87,6 +87,8 @@ public class CharSoupModel { public static final int FLAG_4GRAMS = 1 << 7; /** Feature flag: enable character 5-grams. */ public static final int FLAG_5GRAMS = 1 << 8; + /** Feature flag: enable proportion-weighted script-block presence + transition features. */ + public static final int FLAG_SCRIPT_BLOCKS = 1 << 9; /** Default flags for v1 models (word unigrams only). 
*/ public static final int V1_DEFAULT_FLAGS = FLAG_WORD_UNIGRAMS; @@ -401,19 +403,28 @@ public class CharSoupModel { */ public FeatureExtractor createExtractor() { if (featureFlags == ScriptAwareFeatureExtractor.FEATURE_FLAGS) { - return new ScriptAwareFeatureExtractor(numBuckets); + return new ScriptAwareFeatureExtractor(numBuckets, true); + } + if (featureFlags == ScriptAwareFeatureExtractor.FEATURE_FLAGS_LEGACY) { + return new ScriptAwareFeatureExtractor(numBuckets, false); } if (featureFlags == ShortTextFeatureExtractor.FEATURE_FLAGS) { - return new ShortTextFeatureExtractor(numBuckets); + return new ShortTextFeatureExtractor(numBuckets, true); + } + if (featureFlags == ShortTextFeatureExtractor.FEATURE_FLAGS_LEGACY) { + return new ShortTextFeatureExtractor(numBuckets, false); } throw new IllegalStateException(String.format( Locale.ROOT, "No production FeatureExtractor for featureFlags=0x%03x. " - + "Known: ScriptAware=0x%03x, ShortText=0x%03x. " + + "Known: ScriptAware=0x%03x, ScriptAwareLegacy=0x%03x, " + + "ShortText=0x%03x, ShortTextLegacy=0x%03x. 
" + "Use ResearchFeatureExtractor (test scope) for experimental configs.", featureFlags, ScriptAwareFeatureExtractor.FEATURE_FLAGS, - ShortTextFeatureExtractor.FEATURE_FLAGS)); + ScriptAwareFeatureExtractor.FEATURE_FLAGS_LEGACY, + ShortTextFeatureExtractor.FEATURE_FLAGS, + ShortTextFeatureExtractor.FEATURE_FLAGS_LEGACY)); } public int getFeatureFlags() { diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java index ff228debaa..d8640e30b1 100644 --- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java +++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java @@ -50,16 +50,17 @@ import java.util.Map; * <p>Log-probabilities are quantized to unsigned INT8 over the range * [{@link #LOGP_MIN}, 0] and stored in dense byte arrays. 
* - * <h3>Binary format ({@code GLM1} v2)</h3> + * <h3>Binary format ({@code GLM1} v3)</h3> * <pre> * INT magic = 0x474C4D31 - * INT version = 2 + * INT version = 3 * INT numLangs * INT cjkUnigramBuckets * INT cjkBigramBuckets * INT noncjkUnigramBuckets * INT noncjkBigramBuckets * INT noncjkTrigramBuckets + * INT scriptCategories (v3+) * For each language: * SHORT codeLen * BYTES langCode (UTF-8) @@ -69,6 +70,7 @@ import java.util.Map; * BYTES unigramTable [cjkUnigramBuckets | noncjkUnigramBuckets] * BYTES bigramTable [cjkBigramBuckets | noncjkBigramBuckets] * BYTES trigramTable [noncjkTrigramBuckets] (absent for CJK) + * BYTES scriptTable [scriptCategories] (v3+) * </pre> */ public class GenerativeLanguageModel { @@ -81,6 +83,13 @@ public class GenerativeLanguageModel { public static final int NONCJK_BIGRAM_BUCKETS = 8_192; public static final int NONCJK_TRIGRAM_BUCKETS = 16_384; + /** + * Number of script categories tracked for script-block features. + * Matches {@link ScriptCategory#COUNT} at model-build time; stored in the + * binary so older readers can skip it. + */ + public static final int SCRIPT_CATEGORIES = ScriptCategory.COUNT; + /** Default classpath resource path for the bundled generative model. 
*/ public static final String DEFAULT_MODEL_RESOURCE = "/org/apache/tika/langdetect/charsoup/langdetect-generative-v1-20260310.bin"; @@ -92,7 +101,7 @@ public class GenerativeLanguageModel { public static final float LOGP_MIN = -18.0f; private static final int MAGIC = 0x474C4D31; // "GLM1" - private static final int VERSION = 2; + private static final int VERSION = 3; // ---- FNV-1a basis constants ---- @@ -124,6 +133,7 @@ public class GenerativeLanguageModel { private final byte[][] unigramTables; // [langIdx][bucket] private final byte[][] bigramTables; // [langIdx][bucket] private final byte[][] trigramTables; // [langIdx][bucket]; null entry for CJK langs + private final byte[][] scriptTables; // [langIdx][SCRIPT_CATEGORIES]; null for v1/v2 models private final float[] scoreMeans; // μ per language (from training data) private final float[] scoreStdDevs; // σ per language (from training data) @@ -133,6 +143,7 @@ public class GenerativeLanguageModel { byte[][] unigramTables, byte[][] bigramTables, byte[][] trigramTables, + byte[][] scriptTables, float[] scoreMeans, float[] scoreStdDevs) { this.langIds = Collections.unmodifiableList(new ArrayList<>(langIds)); @@ -140,6 +151,7 @@ public class GenerativeLanguageModel { this.unigramTables = unigramTables; this.bigramTables = bigramTables; this.trigramTables = trigramTables; + this.scriptTables = scriptTables; this.scoreMeans = scoreMeans; this.scoreStdDevs = scoreStdDevs; Map<String, Integer> idx = new HashMap<>(langIds.size() * 2); @@ -213,9 +225,54 @@ public class GenerativeLanguageModel { }); } + if (scriptTables != null && scriptTables[li] != null) { + float scriptScore = scoreScriptDistribution(preprocessed, scriptTables[li]); + if (!Float.isNaN(scriptScore)) { + sum[0] += scriptScore; + cnt[0]++; + } + } + return cnt[0] == 0 ? Float.NaN : (float) (sum[0] / cnt[0]); } + /** + * Compute the average of script log-probs, weighting each script by its share of the text's letters. + * Returns NaN if the text contains no letter codepoints. 
+ */ + static float scoreScriptDistribution(String preprocessed, byte[] scriptTable) { + int[] scriptCounts = new int[SCRIPT_CATEGORIES]; + int totalLetters = 0; + + int i = 0; + int len = preprocessed.length(); + while (i < len) { + int cp = preprocessed.codePointAt(i); + i += Character.charCount(cp); + if (Character.isLetter(cp)) { + int lower = Character.toLowerCase(cp); + int script = ScriptCategory.of(lower); + if (script < SCRIPT_CATEGORIES) { + scriptCounts[script]++; + totalLetters++; + } + } + } + + if (totalLetters == 0) { + return Float.NaN; + } + + double weightedSum = 0.0; + for (int s = 0; s < SCRIPT_CATEGORIES; s++) { + if (scriptCounts[s] > 0) { + double proportion = (double) scriptCounts[s] / totalLetters; + weightedSum += proportion * dequantize(scriptTable[s]); + } + } + return (float) weightedSum; + } + /** * Score {@code text} against all languages and return the best match. * @@ -538,10 +595,11 @@ public class GenerativeLanguageModel { throw new IOException("Not a GLM1 file (bad magic)"); } int version = din.readInt(); - if (version != 1 && version != VERSION) { + if (version < 1 || version > VERSION) { throw new IOException("Unsupported GLM version: " + version); } - boolean hasStats = version >= 2; + boolean hasStats = version >= 2; + boolean hasScript = version >= 3; int numLangs = din.readInt(); int cjkUni = din.readInt(); @@ -550,11 +608,14 @@ public class GenerativeLanguageModel { int noncjkBi = din.readInt(); int noncjkTri = din.readInt(); + int scriptCats = hasScript ? din.readInt() : 0; + List<String> langIds = new ArrayList<>(numLangs); boolean[] isCjk = new boolean[numLangs]; byte[][] unigramTables = new byte[numLangs][]; byte[][] bigramTables = new byte[numLangs][]; byte[][] trigramTables = new byte[numLangs][]; + byte[][] scriptTbls = hasScript ? 
new byte[numLangs][] : null; float[] means = new float[numLangs]; float[] stdDevs = new float[numLangs]; @@ -584,10 +645,15 @@ public class GenerativeLanguageModel { trigramTables[i] = new byte[noncjkTri]; din.readFully(trigramTables[i]); } + + if (hasScript) { + scriptTbls[i] = new byte[scriptCats]; + din.readFully(scriptTbls[i]); + } } return new GenerativeLanguageModel(langIds, isCjk, - unigramTables, bigramTables, trigramTables, + unigramTables, bigramTables, trigramTables, scriptTbls, means, stdDevs); } @@ -605,6 +671,7 @@ public class GenerativeLanguageModel { dout.writeInt(NONCJK_UNIGRAM_BUCKETS); dout.writeInt(NONCJK_BIGRAM_BUCKETS); dout.writeInt(NONCJK_TRIGRAM_BUCKETS); + dout.writeInt(SCRIPT_CATEGORIES); for (int i = 0; i < langIds.size(); i++) { byte[] codeBytes = langIds.get(i).getBytes(StandardCharsets.UTF_8); @@ -618,6 +685,11 @@ public class GenerativeLanguageModel { if (!isCjk[i]) { dout.write(trigramTables[i]); } + if (scriptTables != null && scriptTables[i] != null) { + dout.write(scriptTables[i]); + } else { + dout.write(new byte[SCRIPT_CATEGORIES]); + } } dout.flush(); } @@ -638,6 +710,7 @@ public class GenerativeLanguageModel { private final Map<String, long[]> unigramCounts = new HashMap<>(); private final Map<String, long[]> bigramCounts = new HashMap<>(); private final Map<String, long[]> trigramCounts = new HashMap<>(); + private final Map<String, long[]> scriptCounts = new HashMap<>(); /** * Register a language before feeding it samples. 
Must be called @@ -652,6 +725,7 @@ public class GenerativeLanguageModel { if (!isCjk) { trigramCounts.put(langCode, new long[NONCJK_TRIGRAM_BUCKETS]); } + scriptCounts.put(langCode, new long[SCRIPT_CATEGORIES]); return this; } @@ -683,6 +757,8 @@ public class GenerativeLanguageModel { h -> bg[h % NONCJK_BIGRAM_BUCKETS]++, h -> tg[h % NONCJK_TRIGRAM_BUCKETS]++); } + + accumulateScriptCounts(pp, scriptCounts.get(langCode)); return this; } @@ -695,21 +771,24 @@ public class GenerativeLanguageModel { List<String> ids = new ArrayList<>(cjkFlags.keySet()); int n = ids.size(); - boolean[] cjkArr = new boolean[n]; - byte[][] uniTables = new byte[n][]; - byte[][] biTables = new byte[n][]; - byte[][] triTables = new byte[n][]; + boolean[] cjkArr = new boolean[n]; + byte[][] uniTables = new byte[n][]; + byte[][] biTables = new byte[n][]; + byte[][] triTables = new byte[n][]; + byte[][] scriptTbls = new byte[n][]; for (int i = 0; i < n; i++) { String lang = ids.get(i); - cjkArr[i] = cjkFlags.get(lang); - uniTables[i] = toLogProbTable(unigramCounts.get(lang), addK); - biTables[i] = toLogProbTable(bigramCounts.get(lang), addK); + cjkArr[i] = cjkFlags.get(lang); + uniTables[i] = toLogProbTable(unigramCounts.get(lang), addK); + biTables[i] = toLogProbTable(bigramCounts.get(lang), addK); if (!cjkArr[i]) { triTables[i] = toLogProbTable(trigramCounts.get(lang), addK); } + scriptTbls[i] = toLogProbTable(scriptCounts.get(lang), addK); } - return new GenerativeLanguageModel(ids, cjkArr, uniTables, biTables, triTables, + return new GenerativeLanguageModel(ids, cjkArr, + uniTables, biTables, triTables, scriptTbls, new float[n], new float[n]); } @@ -726,5 +805,21 @@ public class GenerativeLanguageModel { } return table; } + + private static void accumulateScriptCounts(String preprocessed, long[] dest) { + int i = 0; + int len = preprocessed.length(); + while (i < len) { + int cp = preprocessed.codePointAt(i); + i += Character.charCount(cp); + if (Character.isLetter(cp)) { + int lower = 
Character.toLowerCase(cp); + int script = ScriptCategory.of(lower); + if (script < dest.length) { + dest[script]++; + } + } + } + } } } diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java index c97a6bdf6a..32dd1918cc 100644 --- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java +++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java @@ -45,31 +45,48 @@ public class ScriptAwareFeatureExtractor implements FeatureExtractor { * model file always reflects the real inference-time feature set. */ public static final int FEATURE_FLAGS = + CharSoupModel.FLAG_TRIGRAMS + | CharSoupModel.FLAG_SUFFIXES + | CharSoupModel.FLAG_PREFIX + | CharSoupModel.FLAG_WORD_UNIGRAMS + | CharSoupModel.FLAG_CHAR_UNIGRAMS + | CharSoupModel.FLAG_SCRIPT_BLOCKS; + + /** Flags used by models trained before script block features were added. 
*/ + public static final int FEATURE_FLAGS_LEGACY = CharSoupModel.FLAG_TRIGRAMS | CharSoupModel.FLAG_SUFFIXES | CharSoupModel.FLAG_PREFIX | CharSoupModel.FLAG_WORD_UNIGRAMS | CharSoupModel.FLAG_CHAR_UNIGRAMS; - static final int BIGRAM_BASIS = 0x811c9dc5; - static final int TRIGRAM_BASIS = 0x9f4e3c21; - static final int UNIGRAM_BASIS = 0x2f4a3c17; - static final int WORD_BASIS = 0x4a1c7b39; - static final int SUFFIX_BASIS = 0x7e2b1a8f; - static final int PREFIX_BASIS = 0x3b7e9f12; + static final int BIGRAM_BASIS = 0x811c9dc5; + static final int TRIGRAM_BASIS = 0x9f4e3c21; + static final int UNIGRAM_BASIS = 0x2f4a3c17; + static final int WORD_BASIS = 0x4a1c7b39; + static final int SUFFIX_BASIS = 0x7e2b1a8f; + static final int PREFIX_BASIS = 0x3b7e9f12; + public static final int SCRIPT_BASIS = 0x5d8c2e71; + public static final int SCRIPT_TRANS_BASIS = 0x6f1a4b93; static final int MAX_WORD_LENGTH = 30; static final int MIN_WORD_LENGTH = 2; static final int SENTINEL = '_'; private final int numBuckets; + private final boolean useScriptBlocks; public ScriptAwareFeatureExtractor(int numBuckets) { + this(numBuckets, true); + } + + public ScriptAwareFeatureExtractor(int numBuckets, boolean useScriptBlocks) { if (numBuckets <= 0) { throw new IllegalArgumentException( "numBuckets must be positive: " + numBuckets); } this.numBuckets = numBuckets; + this.useScriptBlocks = useScriptBlocks; } @Override @@ -135,6 +152,11 @@ public class ScriptAwareFeatureExtractor implements FeatureExtractor { int suf0 = SENTINEL, suf1 = SENTINEL, suf2 = SENTINEL, suf3 = SENTINEL; int preA = SENTINEL, preB = SENTINEL, preC = SENTINEL; + int[] scriptCounts = useScriptBlocks ? new int[ScriptCategory.COUNT] : null; + int[] transitionCounts = useScriptBlocks + ? 
new int[ScriptCategory.COUNT * ScriptCategory.COUNT] : null; + int lastLetterScript = -1; + int i = 0; int len = text.length(); while (i < len) { @@ -150,6 +172,14 @@ public class ScriptAwareFeatureExtractor implements FeatureExtractor { int script = ScriptCategory.of(lower); boolean cjk = isCjkScript(script); + if (useScriptBlocks) { + scriptCounts[script]++; + if (lastLetterScript >= 0 && lastLetterScript != script) { + transitionCounts[lastLetterScript * ScriptCategory.COUNT + script]++; + } + lastLetterScript = script; + } + if (prevWasLetter) { if (!sameFamily(script, prevScript)) { emitBoundaryEnd(counts, prevScript, prevCp, prevWasCjk, @@ -249,6 +279,10 @@ public class ScriptAwareFeatureExtractor implements FeatureExtractor { wordHash, wordLen, wordScript, suf0, suf1, suf2, suf3, preA, preB, preC); } + + if (useScriptBlocks) { + emitScriptFeatures(counts, scriptCounts, transitionCounts); + } } private void emitBoundaryStart(int[] counts, int script, int lower, boolean cjk) { @@ -310,10 +344,61 @@ public class ScriptAwareFeatureExtractor implements FeatureExtractor { } } + public static final int SCRIPT_SCALE = 100; + + private void emitScriptFeatures(int[] counts, + int[] scriptCounts, + int[] transitionCounts) { + int totalLetters = 0; + for (int c : scriptCounts) { + totalLetters += c; + } + if (totalLetters == 0) { + return; + } + + for (int s = 0; s < ScriptCategory.COUNT; s++) { + if (scriptCounts[s] > 0) { + int weight = (int) Math.round( + (double) SCRIPT_SCALE * scriptCounts[s] / totalLetters); + if (weight > 0) { + int h = fnvFeedByte(SCRIPT_BASIS, s); + counts[(h & 0x7FFFFFFF) % numBuckets] += weight; + } + } + } + + int totalTransitions = 0; + for (int c : transitionCounts) { + totalTransitions += c; + } + if (totalTransitions == 0) { + return; + } + + for (int s = 0; s < ScriptCategory.COUNT; s++) { + for (int t = 0; t < ScriptCategory.COUNT; t++) { + int c = transitionCounts[s * ScriptCategory.COUNT + t]; + if (c > 0) { + int weight = (int) 
Math.round( + (double) SCRIPT_SCALE * c / totalTransitions); + if (weight > 0) { + int h = fnvFeedByte(fnvFeedByte(SCRIPT_TRANS_BASIS, s), t); + counts[(h & 0x7FFFFFFF) % numBuckets] += weight; + } + } + } + } + } + // ---- Script helpers ---- - private static boolean isCjkScript(int script) { + public static boolean isCjkScript(int script) { return script == ScriptCategory.HAN + || script == ScriptCategory.HAN_EXT_A + || script == ScriptCategory.HAN_EXT_B + || script == ScriptCategory.HAN_COMPAT + || script == ScriptCategory.BOPOMOFO || script == ScriptCategory.HIRAGANA || script == ScriptCategory.KATAKANA; } @@ -357,6 +442,6 @@ public class ScriptAwareFeatureExtractor implements FeatureExtractor { @Override public int getFeatureFlags() { - return FEATURE_FLAGS; + return useScriptBlocks ? FEATURE_FLAGS : FEATURE_FLAGS_LEGACY; } } diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java index cbd19d824f..37370e9e6e 100644 --- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java +++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java @@ -56,14 +56,22 @@ public final class ScriptCategory { public static final int TIBETAN = 18; public static final int KHMER = 19; + // CJK sub-blocks — finer-grained Han categories for distinguishing + // Simplified Chinese, Traditional Chinese, Japanese kanji, and Korean hanja. + public static final int HAN_EXT_A = 20; // U+3400–U+4DBF + public static final int HAN_EXT_B = 21; // U+20000–U+2A6DF (and later extensions) + public static final int HAN_COMPAT = 22; // U+F900–U+FAFF + public static final int BOPOMOFO = 23; // U+3100–U+312F, U+31A0–U+31BF + /** Number of distinct categories. 
*/ - public static final int COUNT = 20; + public static final int COUNT = 24; private static final String[] NAMES = { "LATIN", "CYRILLIC", "ARABIC", "HAN", "HANGUL", "HIRAGANA", "KATAKANA", "DEVANAGARI", "THAI", "GREEK", "HEBREW", "BENGALI", "GEORGIAN", "ARMENIAN", "ETHIOPIC", "OTHER", - "CANADIAN_ABORIGINAL", "MYANMAR", "TIBETAN", "KHMER" + "CANADIAN_ABORIGINAL", "MYANMAR", "TIBETAN", "KHMER", + "HAN_EXT_A", "HAN_EXT_B", "HAN_COMPAT", "BOPOMOFO" }; private ScriptCategory() { @@ -84,10 +92,42 @@ public final class ScriptCategory { if (cp < 0x0080) { return LATIN; } + + // Bopomofo (Traditional Chinese phonetic) — check before UnicodeScript + // because Java maps these to BOPOMOFO script, not HAN. + if ((cp >= 0x3100 && cp <= 0x312F) || (cp >= 0x31A0 && cp <= 0x31BF)) { + return BOPOMOFO; + } + Character.UnicodeScript us = Character.UnicodeScript.of(cp); + + // Sub-block routing for HAN codepoints + if (us == Character.UnicodeScript.HAN) { + return hanSubBlock(cp); + } + return fromUnicodeScript(us); } + /** + * Route a HAN codepoint to a sub-block category. + * The common CJK Unified Ideographs block (U+4E00–U+9FFF) maps to {@link #HAN}. + * Rarer blocks that correlate with Traditional Chinese or variant forms + * get their own category for finer-grained language discrimination. + */ + static int hanSubBlock(int cp) { + if (cp >= 0x3400 && cp <= 0x4DBF) { + return HAN_EXT_A; + } + if (cp >= 0xF900 && cp <= 0xFAFF) { + return HAN_COMPAT; + } + if (cp >= 0x20000) { + return HAN_EXT_B; + } + return HAN; + } + /** * Map a {@link Character.UnicodeScript} to a category. 
*/ diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ShortTextFeatureExtractor.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ShortTextFeatureExtractor.java index 912461d54c..234d97e2b9 100644 --- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ShortTextFeatureExtractor.java +++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ShortTextFeatureExtractor.java @@ -42,28 +42,42 @@ public class ShortTextFeatureExtractor implements FeatureExtractor { * describes the features this extractor emits. */ public static final int FEATURE_FLAGS = + CharSoupModel.FLAG_TRIGRAMS + | CharSoupModel.FLAG_WORD_UNIGRAMS + | CharSoupModel.FLAG_4GRAMS + | CharSoupModel.FLAG_SCRIPT_BLOCKS; + + public static final int FEATURE_FLAGS_LEGACY = CharSoupModel.FLAG_TRIGRAMS | CharSoupModel.FLAG_WORD_UNIGRAMS | CharSoupModel.FLAG_4GRAMS; - static final int BIGRAM_BASIS = 0x811c9dc5; - static final int TRIGRAM_BASIS = 0x9f4e3c21; - static final int FOURGRAM_BASIS = 0xa3d8f215; - static final int UNIGRAM_BASIS = 0x2f4a3c17; - static final int WORD_BASIS = 0x4a1c7b39; + static final int BIGRAM_BASIS = 0x811c9dc5; + static final int TRIGRAM_BASIS = 0x9f4e3c21; + static final int FOURGRAM_BASIS = 0xa3d8f215; + static final int UNIGRAM_BASIS = 0x2f4a3c17; + static final int WORD_BASIS = 0x4a1c7b39; + static final int SCRIPT_BASIS = ScriptAwareFeatureExtractor.SCRIPT_BASIS; + static final int SCRIPT_TRANS_BASIS = ScriptAwareFeatureExtractor.SCRIPT_TRANS_BASIS; static final int MAX_WORD_LENGTH = 30; static final int MIN_WORD_LENGTH = 2; static final int SENTINEL = '_'; private final int numBuckets; + private final boolean useScriptBlocks; public ShortTextFeatureExtractor(int numBuckets) { + this(numBuckets, true); + } + + public ShortTextFeatureExtractor(int numBuckets, boolean useScriptBlocks) 
{ if (numBuckets <= 0) { throw new IllegalArgumentException( "numBuckets must be positive: " + numBuckets); } this.numBuckets = numBuckets; + this.useScriptBlocks = useScriptBlocks; } @Override @@ -127,14 +141,17 @@ public class ShortTextFeatureExtractor implements FeatureExtractor { int wordHash = WORD_BASIS; int wordLen = 0; int wordScript = -1; - // rolling suffix window for 4-gram boundary emissions int suf1 = SENTINEL; int suf2 = SENTINEL; int suf3 = SENTINEL; - // prefix window for boundary 4-gram at word start int preA = SENTINEL; int preB = SENTINEL; + int[] scriptCounts = useScriptBlocks ? new int[ScriptCategory.COUNT] : null; + int[] transitionCounts = useScriptBlocks + ? new int[ScriptCategory.COUNT * ScriptCategory.COUNT] : null; + int lastLetterScript = -1; + int i = 0; int len = text.length(); while (i < len) { @@ -150,6 +167,14 @@ public class ShortTextFeatureExtractor implements FeatureExtractor { int script = ScriptCategory.of(lower); boolean cjk = isCjkScript(script); + if (useScriptBlocks) { + scriptCounts[script]++; + if (lastLetterScript >= 0 && lastLetterScript != script) { + transitionCounts[lastLetterScript * ScriptCategory.COUNT + script]++; + } + lastLetterScript = script; + } + if (prevWasLetter) { if (!sameFamily(script, prevScript)) { emitBoundaryEnd(counts, prevScript, prevCp, prevWasCjk, @@ -247,6 +272,10 @@ public class ShortTextFeatureExtractor implements FeatureExtractor { emitBoundaryEnd(counts, prevScript, prevCp, prevWasCjk, wordHash, wordLen, wordScript, suf1, suf2, suf3); } + + if (useScriptBlocks) { + emitScriptFeatures(counts, scriptCounts, transitionCounts); + } } private void emitBoundaryStart(int[] counts, int script, int lower, boolean cjk) { @@ -302,10 +331,55 @@ public class ShortTextFeatureExtractor implements FeatureExtractor { counts[(h & 0x7FFFFFFF) % numBuckets]++; } + static final int SCRIPT_SCALE = ScriptAwareFeatureExtractor.SCRIPT_SCALE; + + private void emitScriptFeatures(int[] counts, + int[] scriptCounts, + 
int[] transitionCounts) { + int totalLetters = 0; + for (int c : scriptCounts) { + totalLetters += c; + } + if (totalLetters == 0) { + return; + } + + for (int s = 0; s < ScriptCategory.COUNT; s++) { + if (scriptCounts[s] > 0) { + int weight = (int) Math.round( + (double) SCRIPT_SCALE * scriptCounts[s] / totalLetters); + if (weight > 0) { + int h = fnvFeedByte(SCRIPT_BASIS, s); + counts[(h & 0x7FFFFFFF) % numBuckets] += weight; + } + } + } + + int totalTransitions = 0; + for (int c : transitionCounts) { + totalTransitions += c; + } + if (totalTransitions == 0) { + return; + } + + for (int s = 0; s < ScriptCategory.COUNT; s++) { + for (int t = 0; t < ScriptCategory.COUNT; t++) { + int c = transitionCounts[s * ScriptCategory.COUNT + t]; + if (c > 0) { + int weight = (int) Math.round( + (double) SCRIPT_SCALE * c / totalTransitions); + if (weight > 0) { + int h = fnvFeedByte(fnvFeedByte(SCRIPT_TRANS_BASIS, s), t); + counts[(h & 0x7FFFFFFF) % numBuckets] += weight; + } + } + } + } + } + private static boolean isCjkScript(int script) { - return script == ScriptCategory.HAN - || script == ScriptCategory.HIRAGANA - || script == ScriptCategory.KATAKANA; + return ScriptAwareFeatureExtractor.isCjkScript(script); } private static boolean sameFamily(int a, int b) { @@ -343,6 +417,6 @@ public class ShortTextFeatureExtractor implements FeatureExtractor { @Override public int getFeatureFlags() { - return FEATURE_FLAGS; + return useScriptBlocks ? 
FEATURE_FLAGS : FEATURE_FLAGS_LEGACY; } } diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java index 99962e60ab..5bd32d7f8a 100644 --- a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java +++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java @@ -210,7 +210,7 @@ public class CharSoupLanguageDetector extends LanguageDetector implements SelfCo FeatureExtractor shortExtractor = null; try { shortModel = CharSoupModel.loadFromClasspath(SHORT_TEXT_MODEL_RESOURCE); - shortExtractor = new ShortTextFeatureExtractor(shortModel.getNumBuckets()); + shortExtractor = shortModel.createExtractor(); verifyFlagsMatch(shortModel, shortExtractor, SHORT_TEXT_MODEL_RESOURCE); SHORT_TEXT_GROUP_INDICES = buildGroupIndices(shortModel); SHORT_TEXT_CLASS_SCRIPT = buildClassScript(shortModel); diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractorTest.java b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractorTest.java index 7828a59eef..fe1164d756 100644 --- a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractorTest.java +++ b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractorTest.java @@ -29,12 +29,15 @@ public class ScriptAwareFeatureExtractorTest { private static final int NUM_BUCKETS = 8192; + private static ScriptAwareFeatureExtractor ngramOnly() { + return new ScriptAwareFeatureExtractor(NUM_BUCKETS, false); + } + // ---- Basic sanity ---- @Test public void testEmptyAndNull() 
{ - ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ScriptAwareFeatureExtractor ext = ngramOnly(); int[] counts = ext.extract(null); assertEquals(NUM_BUCKETS, counts.length); assertEquals(0, sum(counts)); @@ -45,8 +48,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testSingleWord() { - ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ScriptAwareFeatureExtractor ext = ngramOnly(); int[] counts = ext.extract("hello"); // "hello" (production config: bigrams + trigrams + suffix + prefix + word): // bigrams: (_,h) (h,e) (e,l) (l,l) (l,o) (o,_) = 6 @@ -61,7 +63,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testCjkUnigrams() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); // "中文": no sentinels for CJK // bigrams: (中,文) = 1 // unigrams: 中, 文 = 2 @@ -73,7 +75,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testHiraganaUnigrams() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); // "あい": no sentinels for kana // bigrams: (あ,い) = 1 // unigrams: あ, い = 2 @@ -85,7 +87,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testKatakanaUnigrams() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); // "アイ": same as hiragana int[] counts = ext.extract("アイ"); assertEquals(3, sum(counts)); @@ -96,7 +98,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testCjkSpaceBridging() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); // "中 文" with space should produce same features as "中文" // The space is bridged for CJK int[] withSpace = ext.extract("中 文"); @@ -110,7 +112,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testCjkPunctuationBreaks() { ScriptAwareFeatureExtractor ext = - new 
ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); // "中。文" — punctuation IS a real break int[] withPunct = ext.extract("中。文"); int[] noSpace = ext.extract("中文"); @@ -131,7 +133,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testLatinAndCyrillicDontCollide() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); int[] latin = ext.extract("ab"); int[] cyrillic = ext.extract("аб"); assertNotEquals(0, sum(latin)); @@ -152,7 +154,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testJapaneseScriptFamilyNoBoundary() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); // "漢あア" — Han + Hiragana + Katakana // All are CJK family, so no boundary between them. // bigrams: (漢,あ) (あ,ア) = 2 @@ -165,7 +167,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testJapaneseVsLatinCreatesBoundary() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); // "漢a" — Han then Latin: different family → boundary // Han part: (漢) = 1 unigram (no sentinels for CJK) // Latin part: (_,a) (a,_) = 2 bigrams (sentinels) @@ -177,7 +179,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testHanHiraganaBigramChain() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); // "食べる" — Han(食) Hiragana(べ) Hiragana(る) // bigrams: (食,べ) (べ,る) = 2 // unigrams: 食, べ, る = 3 @@ -191,7 +193,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testScriptChangeCreatesBoundary() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); // "abаб" — Latin "ab" followed by Cyrillic "аб" int[] mixed = ext.extract("abаб"); @@ -213,7 +215,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testWordUnigrams() { ScriptAwareFeatureExtractor ext = - new 
ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); // "abc" (production config): // bigrams: (_,a) (a,b) (b,c) (c,_) = 4 // trigrams: (_,a,b) (a,b,c) (b,c,_) = 3 @@ -228,7 +230,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testSingleCharWordNoWordUnigram() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); // "a" — single char word: bigrams only, no trigram/suffix/prefix/word unigram // bigrams: (_,a) (a,_) = 2 // total = 2 @@ -241,7 +243,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testArabicDiacriticsTransparent() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); int[] plain = ext.extract("كتب"); int[] diacritics = ext.extract("كَتَبَ"); for (int i = 0; i < NUM_BUCKETS; i++) { @@ -255,7 +257,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testExtractFromPreprocessed() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); String raw = "Hello https://example.com world"; String preprocessed = CharSoupFeatureExtractor.preprocess(raw); @@ -270,7 +272,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testExtractFromPreprocessedAccumulate() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); int[] counts = ext.extract("hello"); int sum1 = sum(counts); String preprocessed = @@ -378,7 +380,7 @@ public class ScriptAwareFeatureExtractorTest { @RepeatedTest(10) public void testRandomSurrogatePairsAndEdgeCases() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); String[] pathological = { new String(new char[]{0xD800, 0xD801, 0xD802}), @@ -410,7 +412,7 @@ public class ScriptAwareFeatureExtractorTest { @Test public void testDeterministic() { ScriptAwareFeatureExtractor ext = - new ScriptAwareFeatureExtractor(NUM_BUCKETS); + ngramOnly(); String text = 
"The quick brown fox 快速的棕色狐狸 прыгнул через"; int[] first = ext.extract(text); @@ -421,6 +423,45 @@ public class ScriptAwareFeatureExtractorTest { } } + // ---- Script block features ---- + + @Test + public void testScriptBlocksAddWeight() { + ScriptAwareFeatureExtractor withBlocks = + new ScriptAwareFeatureExtractor(NUM_BUCKETS, true); + int[] counts = withBlocks.extract("hello"); + // n-gram features = 14 (same as testSingleWord) + // + script presence: 100% LATIN → weight 100 + // total = 114 + assertEquals(114, sum(counts)); + } + + @Test + public void testScriptBlocksMixedScript() { + ScriptAwareFeatureExtractor withBlocks = + new ScriptAwareFeatureExtractor(NUM_BUCKETS, true); + int[] withScript = withBlocks.extract("hello世界"); + int[] noScript = ngramOnly().extract("hello世界"); + // Script features add presence weights (L1-normalized to 100) + // plus at least one transition (LATIN→HAN) + assertTrue(sum(withScript) > sum(noScript), + "Script block features should add weight"); + int scriptContribution = sum(withScript) - sum(noScript); + // Presence: ~71% LATIN + ~29% HAN = 100 total presence + // Transition: 100% LATIN→HAN = 100 total transition + // Sum should be 200 + assertEquals(200, scriptContribution); + } + + @Test + public void testScriptBlocksDisabled() { + int[] withBlocks = new ScriptAwareFeatureExtractor(NUM_BUCKETS, true) + .extract("hello"); + int[] withoutBlocks = ngramOnly().extract("hello"); + assertTrue(sum(withBlocks) > sum(withoutBlocks)); + assertEquals(14, sum(withoutBlocks)); + } + // ---- Helpers ---- private int sum(int[] arr) { diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ResearchFeatureExtractor.java b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ResearchFeatureExtractor.java index f7713abcb0..56676b1c3f 100644 --- 
a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ResearchFeatureExtractor.java +++ b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ResearchFeatureExtractor.java @@ -21,6 +21,7 @@ import java.util.Arrays; import org.apache.tika.langdetect.charsoup.CharSoupFeatureExtractor; import org.apache.tika.langdetect.charsoup.CharSoupModel; import org.apache.tika.langdetect.charsoup.FeatureExtractor; +import org.apache.tika.langdetect.charsoup.ScriptAwareFeatureExtractor; import org.apache.tika.langdetect.charsoup.ScriptCategory; /** @@ -47,10 +48,13 @@ public class ResearchFeatureExtractor implements FeatureExtractor { static final int CHAR_UNIGRAM_BASIS = 0x1d4f8c3a; static final int FOURGRAM_BASIS = 0xa3d8f215; static final int FIVEGRAM_BASIS = 0xc7b46e38; + static final int SCRIPT_BASIS = ScriptAwareFeatureExtractor.SCRIPT_BASIS; + static final int SCRIPT_TRANS_BASIS = ScriptAwareFeatureExtractor.SCRIPT_TRANS_BASIS; static final int MAX_WORD_LENGTH = 30; static final int MIN_WORD_LENGTH = 2; static final int SENTINEL = '_'; + static final int SCRIPT_SCALE = ScriptAwareFeatureExtractor.SCRIPT_SCALE; private final int numBuckets; private final boolean useTrigrams; @@ -62,10 +66,11 @@ public class ResearchFeatureExtractor implements FeatureExtractor { private final boolean useCharUnigrams; private final boolean use4grams; private final boolean use5grams; + private final boolean useScriptBlocks; /** Minimal constructor: bigrams + word unigrams + CJK unigrams. */ public ResearchFeatureExtractor(int numBuckets) { - this(numBuckets, false, false, false, false, false, true, false, false, false); + this(numBuckets, false, false, false, false, false, true, false, false, false, false); } /** Full-config constructor. All features share the same flat bucket space. 
*/ @@ -79,6 +84,22 @@ public class ResearchFeatureExtractor implements FeatureExtractor { boolean useCharUnigrams, boolean use4grams, boolean use5grams) { + this(numBuckets, useTrigrams, useSkipBigrams, useSuffixes, useSuffix4, + usePrefix, useWordUnigrams, useCharUnigrams, use4grams, use5grams, false); + } + + /** Full-config constructor including script block features. */ + public ResearchFeatureExtractor(int numBuckets, + boolean useTrigrams, + boolean useSkipBigrams, + boolean useSuffixes, + boolean useSuffix4, + boolean usePrefix, + boolean useWordUnigrams, + boolean useCharUnigrams, + boolean use4grams, + boolean use5grams, + boolean useScriptBlocks) { if (numBuckets <= 0) { throw new IllegalArgumentException( "numBuckets must be positive: " + numBuckets); @@ -93,6 +114,7 @@ public class ResearchFeatureExtractor implements FeatureExtractor { this.useCharUnigrams = useCharUnigrams; this.use4grams = use4grams; this.use5grams = use5grams; + this.useScriptBlocks = useScriptBlocks; } @Override @@ -155,6 +177,11 @@ public class ResearchFeatureExtractor implements FeatureExtractor { int preB = SENTINEL; int preC = SENTINEL; + int[] scriptCounts = useScriptBlocks ? new int[ScriptCategory.COUNT] : null; + int[] transitionCounts = useScriptBlocks + ? 
new int[ScriptCategory.COUNT * ScriptCategory.COUNT] : null; + int lastLetterScript = -1; + int i = 0; int len = text.length(); while (i < len) { @@ -170,6 +197,14 @@ public class ResearchFeatureExtractor implements FeatureExtractor { int script = ScriptCategory.of(lower); boolean cjk = isCjkScript(script); + if (useScriptBlocks) { + scriptCounts[script]++; + if (lastLetterScript >= 0 && lastLetterScript != script) { + transitionCounts[lastLetterScript * ScriptCategory.COUNT + script]++; + } + lastLetterScript = script; + } + if (prevWasLetter) { if (!sameFamily(script, prevScript)) { emitBoundaryEnd(counts, prevScript, prevCp, prevWasCjk, @@ -299,6 +334,55 @@ public class ResearchFeatureExtractor implements FeatureExtractor { wordHash, wordLen, wordScript, suf0, suf1, suf2, suf3, preA, preB, preC); } + + if (useScriptBlocks) { + emitScriptFeatures(counts, scriptCounts, transitionCounts); + } + } + + private void emitScriptFeatures(int[] counts, + int[] scriptCounts, + int[] transitionCounts) { + int totalLetters = 0; + for (int c : scriptCounts) { + totalLetters += c; + } + if (totalLetters == 0) { + return; + } + + for (int s = 0; s < ScriptCategory.COUNT; s++) { + if (scriptCounts[s] > 0) { + int weight = (int) Math.round( + (double) SCRIPT_SCALE * scriptCounts[s] / totalLetters); + if (weight > 0) { + int h = fnvFeedByte(SCRIPT_BASIS, s); + counts[(h & 0x7FFFFFFF) % numBuckets] += weight; + } + } + } + + int totalTransitions = 0; + for (int c : transitionCounts) { + totalTransitions += c; + } + if (totalTransitions == 0) { + return; + } + + for (int s = 0; s < ScriptCategory.COUNT; s++) { + for (int t = 0; t < ScriptCategory.COUNT; t++) { + int c = transitionCounts[s * ScriptCategory.COUNT + t]; + if (c > 0) { + int weight = (int) Math.round( + (double) SCRIPT_SCALE * c / totalTransitions); + if (weight > 0) { + int h = fnvFeedByte(fnvFeedByte(SCRIPT_TRANS_BASIS, s), t); + counts[(h & 0x7FFFFFFF) % numBuckets] += weight; + } + } + } + } } private void 
emitBoundaryStart(int[] counts, int script, int lower, boolean cjk) { @@ -402,9 +486,7 @@ public class ResearchFeatureExtractor implements FeatureExtractor { } private static boolean isCjkScript(int script) { - return script == ScriptCategory.HAN - || script == ScriptCategory.HIRAGANA - || script == ScriptCategory.KATAKANA; + return ScriptAwareFeatureExtractor.isCjkScript(script); } private static boolean sameFamily(int a, int b) { @@ -452,6 +534,7 @@ public class ResearchFeatureExtractor implements FeatureExtractor { if (useCharUnigrams) flags |= CharSoupModel.FLAG_CHAR_UNIGRAMS; if (use4grams) flags |= CharSoupModel.FLAG_4GRAMS; if (use5grams) flags |= CharSoupModel.FLAG_5GRAMS; + if (useScriptBlocks) flags |= CharSoupModel.FLAG_SCRIPT_BLOCKS; return flags; } }
