This is an automated email from the ASF dual-hosted git repository.

joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new 98b8758  OPENNLP-1197: support Japanese letters in FeatureGeneratorUtil
98b8758 is described below

commit 98b8758e761caf659608f48fc5c9e9056f911050
Author: koji <[email protected]>
AuthorDate: Fri May 18 09:38:16 2018 +0900

    OPENNLP-1197: support Japanese letters in FeatureGeneratorUtil
---
 .../util/featuregen/FeatureGeneratorUtil.java      | 10 +++-
 .../tools/util/featuregen/StringPattern.java       | 57 +++++++++++++++++++---
 .../util/featuregen/FeatureGeneratorUtilTest.java  | 19 ++++++++
 .../tools/util/featuregen/StringPatternTest.java   | 26 ++++++++++
 4 files changed, 103 insertions(+), 9 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java
index 79c2a50..e6b8af9 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java
@@ -34,6 +34,8 @@ public class FeatureGeneratorUtil {
    * Generates a class name for the specified token.
    * The classes are as follows where the first matching class is used:
    * <ul>
+   * <li>jah - Japanese Hiragana</li>
+   * <li>jak - Japanese Katakana</li>
    * <li>lc - lowercase alphabetic</li>
    * <li>2d - two digits </li>
    * <li>4d - four digits </li>
@@ -56,7 +58,13 @@ public class FeatureGeneratorUtil {
     StringPattern pattern = StringPattern.recognize(token);
 
     String feat;
-    if (pattern.isAllLowerCaseLetter()) {
+    if (pattern.isAllHiragana()) {
+      feat = "jah";
+    }
+    else if (pattern.isAllKatakana()) {
+      feat = "jak";
+    }
+    else if (pattern.isAllLowerCaseLetter()) {
       feat = "lc";
     }
     else if (pattern.digits() == 2) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java
index eae7bc4..458912f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java
@@ -27,13 +27,15 @@ public class StringPattern {
   private static final int ALL_LOWERCASE_LETTER = 0x1 << 2;
   private static final int ALL_LETTERS = 0x1 << 3;
   private static final int ALL_DIGIT = 0x1 << 4;
-  private static final int CONTAINS_PERIOD = 0x1 << 5;
-  private static final int CONTAINS_COMMA = 0x1 << 6;
-  private static final int CONTAINS_SLASH = 0x1 << 7;
-  private static final int CONTAINS_DIGIT = 0x1 << 8;
-  private static final int CONTAINS_HYPHEN = 0x1 << 9;
-  private static final int CONTAINS_LETTERS = 0x1 << 10;
-  private static final int CONTAINS_UPPERCASE = 0x1 << 11;
+  private static final int ALL_HIRAGANA = 0x1 << 5;
+  private static final int ALL_KATAKANA = 0x1 << 6;
+  private static final int CONTAINS_PERIOD = 0x1 << 7;
+  private static final int CONTAINS_COMMA = 0x1 << 8;
+  private static final int CONTAINS_SLASH = 0x1 << 9;
+  private static final int CONTAINS_DIGIT = 0x1 << 10;
+  private static final int CONTAINS_HYPHEN = 0x1 << 11;
+  private static final int CONTAINS_LETTERS = 0x1 << 12;
+  private static final int CONTAINS_UPPERCASE = 0x1 << 13;
 
   private final int pattern;
 
@@ -46,7 +48,8 @@ public class StringPattern {
 
   public static StringPattern recognize(String token) {
 
-    int pattern = ALL_CAPITAL_LETTER | ALL_LOWERCASE_LETTER | ALL_DIGIT | ALL_LETTERS;
+    int pattern = ALL_CAPITAL_LETTER | ALL_LOWERCASE_LETTER | ALL_DIGIT | ALL_LETTERS
+        | ALL_HIRAGANA | ALL_KATAKANA;
 
     int digits = 0;
 
@@ -83,6 +86,7 @@ public class StringPattern {
 
         if (letterType == Character.DECIMAL_DIGIT_NUMBER) {
           pattern |= CONTAINS_DIGIT;
+          pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA);
           digits++;
         } else {
           pattern &= ~ALL_DIGIT;
@@ -109,6 +113,29 @@ public class StringPattern {
             break;
         }
       }
+
+      // for Japanese...
+      final int codePoint = token.codePointAt(i);
+      final Character.UnicodeScript us = Character.UnicodeScript.of(codePoint);
+      if (us != Character.UnicodeScript.COMMON) {
+        if (us == Character.UnicodeScript.LATIN) {
+          pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA);
+        }
+        else if (us == Character.UnicodeScript.HAN) {
+          pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA | ALL_LOWERCASE_LETTER);
+        }
+        else if (us == Character.UnicodeScript.HIRAGANA) {
+          pattern &= ~(ALL_KATAKANA | ALL_LOWERCASE_LETTER);
+        }
+        else if (us == Character.UnicodeScript.KATAKANA) {
+          pattern &= ~(ALL_HIRAGANA | ALL_LOWERCASE_LETTER);
+        }
+      }
+      else {
+        if (ch == ',' || ch == '.' || ch == '?' || ch == '!') {
+          pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA);
+        }
+      }
     }
 
     return new StringPattern(pattern, digits);
@@ -150,6 +177,20 @@ public class StringPattern {
   }
 
   /**
+   * @return true if all chars are hiragana.
+   */
+  public boolean isAllHiragana() {
+    return (pattern & ALL_HIRAGANA) > 0;
+  }
+
+  /**
+   * @return true if all chars are katakana.
+   */
+  public boolean isAllKatakana() {
+    return (pattern & ALL_KATAKANA) > 0;
+  }
+
+  /**
    * Retrieves the number of digits.
    */
   public int digits() {
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java
index cca0d83..7d7f233 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java
@@ -41,5 +41,24 @@ public class FeatureGeneratorUtilTest {
     Assert.assertEquals("cp", FeatureGeneratorUtil.tokenFeature("A."));
     Assert.assertEquals("ic", FeatureGeneratorUtil.tokenFeature("Mike"));
     Assert.assertEquals("other", FeatureGeneratorUtil.tokenFeature("somethingStupid"));
+
+    // symbols
+    Assert.assertEquals("other", FeatureGeneratorUtil.tokenFeature(","));
+    Assert.assertEquals("other", FeatureGeneratorUtil.tokenFeature("."));
+    Assert.assertEquals("other", FeatureGeneratorUtil.tokenFeature("?"));
+    Assert.assertEquals("other", FeatureGeneratorUtil.tokenFeature("!"));
+  }
+
+  @Test
+  public void testJapanese() {
+    // Hiragana
+    Assert.assertEquals("jah", FeatureGeneratorUtil.tokenFeature("そういえば"));
+    Assert.assertEquals("jah", FeatureGeneratorUtil.tokenFeature("おーぷん・そ〜す・そふとうぇあ"));
+    Assert.assertEquals("other", FeatureGeneratorUtil.tokenFeature("あぱっち・そふとうぇあ財団"));
+
+    // Katakana
+    Assert.assertEquals("jak", FeatureGeneratorUtil.tokenFeature("ジャパン"));
+    Assert.assertEquals("jak", FeatureGeneratorUtil.tokenFeature("オープン・ソ〜ス・ソフトウェア"));
+    Assert.assertEquals("other", FeatureGeneratorUtil.tokenFeature("アパッチ・ソフトウェア財団"));
   }
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/StringPatternTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/StringPatternTest.java
index 187bb2f..75a7b8f 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/StringPatternTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/StringPatternTest.java
@@ -29,6 +29,7 @@ public class StringPatternTest {
     Assert.assertTrue(StringPattern.recognize("TesT").isAllLetter());
     Assert.assertTrue(StringPattern.recognize("grün").isAllLetter());
     Assert.assertTrue(StringPattern.recognize("üäöæß").isAllLetter());
+    Assert.assertTrue(StringPattern.recognize("あア亜Aa").isAllLetter());
   }
 
   @Test
@@ -37,6 +38,9 @@ public class StringPatternTest {
     Assert.assertFalse(StringPattern.recognize("tEST").isInitialCapitalLetter());
     Assert.assertTrue(StringPattern.recognize("TesT").isInitialCapitalLetter());
     Assert.assertTrue(StringPattern.recognize("Üäöæß").isInitialCapitalLetter());
+    Assert.assertFalse(StringPattern.recognize("いイ井").isInitialCapitalLetter());
+    Assert.assertTrue(StringPattern.recognize("Iいイ井").isInitialCapitalLetter());
+    Assert.assertTrue(StringPattern.recognize("Iいイ井").isInitialCapitalLetter());
   }
 
   @Test
@@ -45,6 +49,8 @@ public class StringPatternTest {
     Assert.assertTrue(StringPattern.recognize("ÄÄÄÜÜÜÖÖÖÖ").isAllCapitalLetter());
     Assert.assertFalse(StringPattern.recognize("ÄÄÄÜÜÜÖÖä").isAllCapitalLetter());
     Assert.assertFalse(StringPattern.recognize("ÄÄÄÜÜdÜÖÖ").isAllCapitalLetter());
+    Assert.assertTrue(StringPattern.recognize("ABC").isAllCapitalLetter());
+    Assert.assertFalse(StringPattern.recognize("うウ宇").isAllCapitalLetter());
   }
 
   @Test
@@ -56,6 +62,8 @@ public class StringPatternTest {
     Assert.assertFalse(StringPattern.recognize("TEST").isAllLowerCaseLetter());
     Assert.assertFalse(StringPattern.recognize("testT").isAllLowerCaseLetter());
     Assert.assertFalse(StringPattern.recognize("tesÖt").isAllLowerCaseLetter());
+    Assert.assertTrue(StringPattern.recognize("abc").isAllLowerCaseLetter());
+    Assert.assertFalse(StringPattern.recognize("えエ絵").isAllLowerCaseLetter());
   }
 
   @Test
@@ -63,6 +71,21 @@ public class StringPatternTest {
     Assert.assertTrue(StringPattern.recognize("123456").isAllDigit());
     Assert.assertFalse(StringPattern.recognize("123,56").isAllDigit());
     Assert.assertFalse(StringPattern.recognize("12356f").isAllDigit());
+    Assert.assertTrue(StringPattern.recognize("123456").isAllDigit());
+  }
+
+  @Test
+  public void testIsAllHiragana() {
+    Assert.assertTrue(StringPattern.recognize("あぱっち・るしーん").isAllHiragana());
+    Assert.assertFalse(StringPattern.recognize("あぱっち・そふとうぇあ財団").isAllHiragana());
+    Assert.assertFalse(StringPattern.recognize("あぱっち・るしーんV1.0").isAllHiragana());
+  }
+
+  @Test
+  public void testIsAllKatakana() {
+    Assert.assertTrue(StringPattern.recognize("アパッチ・ルシーン").isAllKatakana());
+    Assert.assertFalse(StringPattern.recognize("アパッチ・ソフトウェア財団").isAllKatakana());
+    Assert.assertFalse(StringPattern.recognize("アパッチ・ルシーンV1.0").isAllKatakana());
   }
 
   @Test
@@ -70,6 +93,7 @@ public class StringPatternTest {
     Assert.assertEquals(6, StringPattern.recognize("123456").digits());
     Assert.assertEquals(3, StringPattern.recognize("123fff").digits());
     Assert.assertEquals(0, StringPattern.recognize("test").digits());
+    Assert.assertEquals(3, StringPattern.recognize("123fff").digits());
   }
 
   @Test
@@ -98,6 +122,8 @@ public class StringPatternTest {
     Assert.assertTrue(StringPattern.recognize("test1").containsDigit());
     Assert.assertTrue(StringPattern.recognize("23,5").containsDigit());
     Assert.assertFalse(StringPattern.recognize("test./-,").containsDigit());
+    Assert.assertTrue(StringPattern.recognize("テスト1").containsDigit());
+    Assert.assertFalse(StringPattern.recognize("テストTEST").containsDigit());
   }
 
   @Test

-- 
To stop receiving notification emails like this one, please contact
[email protected].

Reply via email to