Amire80 has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/211400

Change subject: Add segmentation for Ethiopic languages
......................................................................

Add segmentation for Ethiopic languages

Bug: T98345
Change-Id: I3a47e630dcbaff8f5b9c60b1a4d48b3db7d5606d
---
A segmentation/languages/SegmenterAm.js
M segmentation/languages/index.js
M tests/segmentation/SegmentationTests.json
A tests/segmentation/data/result-23.html
A tests/segmentation/data/test-23.html
5 files changed, 54 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/00/211400/1

diff --git a/segmentation/languages/SegmenterAm.js b/segmentation/languages/SegmenterAm.js
new file mode 100644
index 0000000..e3bdf38
--- /dev/null
+++ b/segmentation/languages/SegmenterAm.js
@@ -0,0 +1,43 @@
+var findAll = require( '../../lineardoc' ).Utils.findAll;
+
+/**
+ * Test a possible Amharic sentence boundary match
+ *
+ * @param {string} text The plaintext to segment
+ * @param {Object} match The possible boundary match (returned by regex.exec)
+ * @return {number|null} The boundary offset, or null if not a sentence boundary
+ */
+
+function findBoundary( text, match ) {
+       var tail = text.slice( match.index + 1, text.length );
+
+       // Trailing non-final punctuation: not a sentence boundary
+       if ( tail.match( /^[,;:]/ ) ) {
+               return null;
+       }
+
+       // Next word character is number or lower-case: not a sentence boundary
+       if ( tail.match( /^\W*[0-9a-z]/ ) ) {
+               return null;
+       }
+
+       // Include any closing punctuation and trailing space
+       return match.index + 1 + tail.match( /^['”"’]*\s*/ )[ 0 ].length;
+}
+
+/**
+ * Find Amharic sentence boundaries
+ *
+ * @param {string} text The plaintext to segment
+ * @returns {number[]} Sentence boundary offsets
+ */
+function getBoundaries( text ) {
+       // Regex to find possible Amharic sentence boundaries.
+       // Must not use a shared regex instance (re.lastIndex is used).
+       // In the Ethiopic script ። is used as a full stop.
+       return findAll( text, /[።!?]/g, findBoundary );
+}
+
+module.exports = {
+       getBoundaries: getBoundaries
+};
diff --git a/segmentation/languages/index.js b/segmentation/languages/index.js
index 766d654..f5e623b 100644
--- a/segmentation/languages/index.js
+++ b/segmentation/languages/index.js
@@ -1,12 +1,14 @@
 'use strict';
 
 module.exports.Segmenters = {
+       am: require( __dirname + '/SegmenterAm.js' ),
        en: require( __dirname + '/SegmenterEn.js' ),
        hi: require( __dirname + '/SegmenterHi.js' ),
        hy: require( __dirname + '/SegmenterHy.js' ),
        ja: require( __dirname + '/SegmenterJa.js' ),
        pa: require( __dirname + '/SegmenterHi.js' ),
        sa: require( __dirname + '/SegmenterHi.js' ),
+       ti: require( __dirname + '/SegmenterAm.js' ),
        zh: require( __dirname + '/SegmenterZh.js' ),
        default: require( __dirname + '/SegmenterDefault.js' )
 };
diff --git a/tests/segmentation/SegmentationTests.json b/tests/segmentation/SegmentationTests.json
index 2f6eb3c..9ca712a 100644
--- a/tests/segmentation/SegmentationTests.json
+++ b/tests/segmentation/SegmentationTests.json
@@ -1,4 +1,11 @@
 {
+       "am": [
+               {
+                       "desc": "Amharic segmentation - basic test",
+                       "source": "test-23.html",
+                       "result": "result-23.html"
+               }
+       ],
        "en": [
                {
                        "desc": "Simple paragraph test",
diff --git a/tests/segmentation/data/result-23.html b/tests/segmentation/data/result-23.html
new file mode 100644
index 0000000..19bdb26
--- /dev/null
+++ b/tests/segmentation/data/result-23.html
@@ -0,0 +1 @@
+<p id="0"><span class="cx-segment" data-segmentid="1">ቴዎድሮስ <a class="cx-link" 
data-linkid="2" href="/wiki/%E1%8C%A5%E1%88%AD_%E1%8D%AE" title="ጥር ፮">ጥር ፮</a> 
ቀን <a class="cx-link" data-linkid="3" 
href="/w/index.php?title=1811&#38;action=edit&#38;redlink=1" title="1811 (ገጹ ገና 
አልተጻፈም)">፲፰፻፲፩</a> ዓ.ም. ሻርጌ በተባለ ቦታ <a class="cx-link" data-linkid="4" 
href="/wiki/%E1%89%8B%E1%88%AB" title="ቋራ">ቋራ</a> ውስጥ፣ ከ<a class="cx-link" 
data-linkid="5" href="/wiki/%E1%8C%8E%E1%8A%95%E1%8B%B0%E1%88%AD" 
title="ጎንደር">ጎንደር ከተማ</a> በስተ ምዕራብ ተወለዱ። </span><span class="cx-segment" 
data-segmentid="6">የተወለዱትም አገሪቷ በባላባቶች ተከፋፍላ በምትመራበት-<a class="cx-link" 
data-linkid="7" 
href="/wiki/%E1%8B%98%E1%88%98%E1%8A%90_%E1%88%98%E1%88%B3%E1%8D%8D%E1%8A%95%E1%89%B5"
 title="ዘመነ መሳፍንት">ዘመነ መሳፍንት</a> በሚባለው ወቅት ነበር። </span><span class="cx-segment" 
data-segmentid="8">አባታቸው <a class="cx-link" data-linkid="9" 
href="/w/index.php?title=%E1%8B%B0%E1%8C%83%E1%8B%9D%E1%88%9B%E1%89%BD&#38;action=edit&#38;redlink=1"
 title="ደጃዝማች (ገጹ ገና አልተጻፈም)">ደጃዝማች</a> <a class="cx-link" data-linkid="10" 
href="/w/index.php?title=%E1%8A%83%E1%8B%AD%E1%88%89_%E1%8B%88%E1%88%8D%E1%8B%B0_%E1%8C%8A%E1%8B%AE%E1%88%AD%E1%8C%8A%E1%88%B5&#38;action=edit&#38;redlink=1"
 title="ኃይሉ ወልደ ጊዮርጊስ (ገጹ ገና አልተጻፈም)">ኃይሉ ወልደ ጊዮርጊስ</a> የቋራ ገዢ ነበሩ። 
</span><span class="cx-segment" data-segmentid="11">ዓፄ ቴዎድሮስ በህጻንነታቸው የቄስ ትምህርት 
ከቀሰሙ በኋላ፣ የአጎታቸውንና በኋላም ለጥቂት ጊዜ የ<a class="cx-link" data-linkid="12" 
href="/wiki/%E1%8C%8E%E1%8C%83%E1%88%9D" title="ጎጃም">ጎጃሙን</a> ጦር መሪ የ<a 
class="cx-link" data-linkid="13" 
href="/w/index.php?title=%E1%8C%8E%E1%88%B9_%E1%8B%98%E1%8B%8D%E1%8B%B4&#38;action=edit&#38;redlink=1"
 title="ጎሹ ዘውዴ (ገጹ ገና አልተጻፈም)">ጎሹ ዘውዴ</a>ን ጦር ተቀላቀሉ። </span><span 
class="cx-segment" data-segmentid="14">በዚሁ የውትድርና ዘመናቸው ከፍተኛ ችሎታን ማስመዝገብ ስለጀመሩና 
ዝናቸው ስለተስፋፋ በ<a class="cx-link" data-linkid="15" 
href="/w/index.php?title=1839&#38;action=edit&#38;redlink=1" title="1839 (ገጹ ገና 
አልተጻፈም)">፲፰፻፴፱</a> ዓ.ም. በወይዘሮ <a class="cx-link" data-linkid="16" 
href="/w/index.php?title=%E1%88%98%E1%8A%90%E1%8A%95_%E1%88%8A%E1%89%A0%E1%8A%95_%E1%8A%A0%E1%88%9D%E1%8B%B4&#38;action=edit&#38;redlink=1"
 title="መነን ሊበን አምዴ (ገጹ ገና አልተጻፈም)">መነን ሊበን አምዴ</a> አነሳሽነት የልጇን የራስ <a 
class="cx-link" data-linkid="17" 
href="/w/index.php?title=%E1%8A%A0%E1%88%8A_%E1%8A%A0%E1%88%89%E1%88%8B&#38;action=edit&#38;redlink=1"
 title="አሊ አሉላ (ገጹ ገና አልተጻፈም)">አሊ አሉላ</a>ን ልጅ፣ <a class="cx-link" 
data-linkid="18" 
href="/w/index.php?title=%E1%89%B0%E1%8B%8B%E1%89%A0%E1%89%BD_%E1%8A%A0%E1%88%8A&#38;action=edit&#38;redlink=1"
 title="ተዋበች አሊ (ገጹ ገና አልተጻፈም)">ተዋበች አሊ</a>ን ተዳሩ፤ እንዲሁም በ<a class="cx-link" 
data-linkid="19" 
href="/w/index.php?title=%E1%8B%B0%E1%8C%83%E1%8B%9D%E1%88%9B%E1%89%BD&#38;action=edit&#38;redlink=1"
 title="ደጃዝማች (ገጹ ገና አልተጻፈም)">ደጃዝማች</a>ነት ማዕረግ የቋራ አስተዳዳሪ ሆነው ተሾሙ። </span><span 
class="cx-segment" data-segmentid="20">ቴዎድሮስ ግን በ<a class="cx-link" 
data-linkid="21" href="/w/index.php?title=1844&#38;action=edit&#38;redlink=1" 
title="1844 (ገጹ ገና አልተጻፈም)">፲፰፻፵፬</a> ዓ.ም. አጠቃላይ የዘመነ መሳፍንት ሥርዓትን በመቃወም በሰሜናዊ 
ባላባቶች ላይ ዘመቻ ጀመሩ። </span><span class="cx-segment" data-segmentid="22">በኒህ ተከታታይ 
ዘመቻወች የገጠሟቸውን ባላባቶች ስላሸነፉ፣ መጀመሪያ የ<a class="cx-link" data-linkid="23" 
href="/wiki/%E1%88%AB%E1%88%B5" title="ራስ">ራስ</a> ማዕረግን በኋላም የ<a 
class="cx-link" data-linkid="24" href="/wiki/%E1%8A%95%E1%8C%89%E1%88%A5" 
title="ንጉሥ">ንጉሥ</a> ማዕረግን በአንድ ዓመት ውስጥ ተቀዳጁ። </span><span class="cx-segment" 
data-segmentid="25">በየጊዜው በሚያደርጉት የተሳካ ዘመቻ የዘመኑን ባላባቶች ኃይል በመሰባበር <a 
class="cx-link" data-linkid="26" 
href="/wiki/%E1%8B%A8%E1%8A%AB%E1%89%B2%E1%89%B5_3" title="የካቲት 3">የካቲት ፫</a> 
ቀን <a class="cx-link" data-linkid="27" 
href="/w/index.php?title=1847&#38;action=edit&#38;redlink=1" title="1847 (ገጹ ገና 
አልተጻፈም)">፲፰፻፵፯</a> ዓ.ም ንጉሥ ካሳ - ዳግማዊ ዓፄ ቴዎድሮስ ተብለው የኢትዮጵያ ንጉሠ ነገሥት 
ሆኑ።</span></p>
diff --git a/tests/segmentation/data/test-23.html b/tests/segmentation/data/test-23.html
new file mode 100644
index 0000000..6979b02
--- /dev/null
+++ b/tests/segmentation/data/test-23.html
@@ -0,0 +1 @@
+<p>ቴዎድሮስ <a href="/wiki/%E1%8C%A5%E1%88%AD_%E1%8D%AE" title="ጥር ፮">ጥር ፮</a> ቀን 
<a href="/w/index.php?title=1811&amp;action=edit&amp;redlink=1" class="new" 
title="1811 (ገጹ ገና አልተጻፈም)">፲፰፻፲፩</a> ዓ.ም. ሻርጌ በተባለ ቦታ <a 
href="/wiki/%E1%89%8B%E1%88%AB" title="ቋራ">ቋራ</a> ውስጥ፣ ከ<a 
href="/wiki/%E1%8C%8E%E1%8A%95%E1%8B%B0%E1%88%AD" title="ጎንደር" 
class="mw-redirect">ጎንደር ከተማ</a> በስተ ምዕራብ ተወለዱ። የተወለዱትም አገሪቷ በባላባቶች ተከፋፍላ 
በምትመራበት-<a 
href="/wiki/%E1%8B%98%E1%88%98%E1%8A%90_%E1%88%98%E1%88%B3%E1%8D%8D%E1%8A%95%E1%89%B5"
 title="ዘመነ መሳፍንት">ዘመነ መሳፍንት</a> በሚባለው ወቅት ነበር። አባታቸው <a 
href="/w/index.php?title=%E1%8B%B0%E1%8C%83%E1%8B%9D%E1%88%9B%E1%89%BD&amp;action=edit&amp;redlink=1"
 class="new" title="ደጃዝማች (ገጹ ገና አልተጻፈም)">ደጃዝማች</a> <a 
href="/w/index.php?title=%E1%8A%83%E1%8B%AD%E1%88%89_%E1%8B%88%E1%88%8D%E1%8B%B0_%E1%8C%8A%E1%8B%AE%E1%88%AD%E1%8C%8A%E1%88%B5&amp;action=edit&amp;redlink=1"
 class="new" title="ኃይሉ ወልደ ጊዮርጊስ (ገጹ ገና አልተጻፈም)">ኃይሉ ወልደ ጊዮርጊስ</a> የቋራ ገዢ ነበሩ። 
ዓፄ ቴዎድሮስ በህጻንነታቸው የቄስ ትምህርት ከቀሰሙ በኋላ፣ የአጎታቸውንና በኋላም ለጥቂት ጊዜ የ<a 
href="/wiki/%E1%8C%8E%E1%8C%83%E1%88%9D" title="ጎጃም">ጎጃሙን</a> ጦር መሪ የ<a 
href="/w/index.php?title=%E1%8C%8E%E1%88%B9_%E1%8B%98%E1%8B%8D%E1%8B%B4&amp;action=edit&amp;redlink=1"
 class="new" title="ጎሹ ዘውዴ (ገጹ ገና አልተጻፈም)">ጎሹ ዘውዴ</a>ን ጦር ተቀላቀሉ። በዚሁ የውትድርና 
ዘመናቸው ከፍተኛ ችሎታን ማስመዝገብ ስለጀመሩና ዝናቸው ስለተስፋፋ በ<a 
href="/w/index.php?title=1839&amp;action=edit&amp;redlink=1" class="new" 
title="1839 (ገጹ ገና አልተጻፈም)">፲፰፻፴፱</a> ዓ.ም. በወይዘሮ <a 
href="/w/index.php?title=%E1%88%98%E1%8A%90%E1%8A%95_%E1%88%8A%E1%89%A0%E1%8A%95_%E1%8A%A0%E1%88%9D%E1%8B%B4&amp;action=edit&amp;redlink=1"
 class="new" title="መነን ሊበን አምዴ (ገጹ ገና አልተጻፈም)">መነን ሊበን አምዴ</a> አነሳሽነት የልጇን የራስ 
<a 
href="/w/index.php?title=%E1%8A%A0%E1%88%8A_%E1%8A%A0%E1%88%89%E1%88%8B&amp;action=edit&amp;redlink=1"
 class="new" title="አሊ አሉላ (ገጹ ገና አልተጻፈም)">አሊ አሉላ</a>ን ልጅ፣ <a 
href="/w/index.php?title=%E1%89%B0%E1%8B%8B%E1%89%A0%E1%89%BD_%E1%8A%A0%E1%88%8A&amp;action=edit&amp;redlink=1"
 class="new" title="ተዋበች አሊ (ገጹ ገና አልተጻፈም)">ተዋበች አሊ</a>ን ተዳሩ፤ እንዲሁም በ<a 
href="/w/index.php?title=%E1%8B%B0%E1%8C%83%E1%8B%9D%E1%88%9B%E1%89%BD&amp;action=edit&amp;redlink=1"
 class="new" title="ደጃዝማች (ገጹ ገና አልተጻፈም)">ደጃዝማች</a>ነት ማዕረግ የቋራ አስተዳዳሪ ሆነው ተሾሙ። 
ቴዎድሮስ ግን በ<a href="/w/index.php?title=1844&amp;action=edit&amp;redlink=1" 
class="new" title="1844 (ገጹ ገና አልተጻፈም)">፲፰፻፵፬</a> ዓ.ም. አጠቃላይ የዘመነ መሳፍንት ሥርዓትን 
በመቃወም በሰሜናዊ ባላባቶች ላይ ዘመቻ ጀመሩ። በኒህ ተከታታይ ዘመቻወች የገጠሟቸውን ባላባቶች ስላሸነፉ፣ መጀመሪያ የ<a 
href="/wiki/%E1%88%AB%E1%88%B5" title="ራስ">ራስ</a> ማዕረግን በኋላም የ<a 
href="/wiki/%E1%8A%95%E1%8C%89%E1%88%A5" title="ንጉሥ">ንጉሥ</a> ማዕረግን በአንድ ዓመት ውስጥ 
ተቀዳጁ። በየጊዜው በሚያደርጉት የተሳካ ዘመቻ የዘመኑን ባላባቶች ኃይል በመሰባበር <a 
href="/wiki/%E1%8B%A8%E1%8A%AB%E1%89%B2%E1%89%B5_3" title="የካቲት 3" 
class="mw-redirect">የካቲት ፫</a> ቀን <a 
href="/w/index.php?title=1847&amp;action=edit&amp;redlink=1" class="new" 
title="1847 (ገጹ ገና አልተጻፈም)">፲፰፻፵፯</a> ዓ.ም ንጉሥ ካሳ - ዳግማዊ ዓፄ ቴዎድሮስ ተብለው የኢትዮጵያ 
ንጉሠ ነገሥት ሆኑ።</p>

-- 
To view, visit https://gerrit.wikimedia.org/r/211400
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I3a47e630dcbaff8f5b9c60b1a4d48b3db7d5606d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Amire80 <amir.ahar...@mail.huji.ac.il>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to