Divec has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/118690

Change subject: WIP: Use ve.dm linear model for segmentation
......................................................................

WIP: Use ve.dm linear model for segmentation

Use VisualEditor linear model to ensure balanced HTML when performing
sentence segmentation. This allows sentences to be manipulated more easily
in an HTML DOM (wrapped in HTML span tags etc).

Change-Id: I17f33d8db5a889d076619267316f2676e3255cb8
TODO: Include ve properly
TODO: Language-specific subclasses
---
M server/segmentation/SentenceSegmenter.js
1 file changed, 50 insertions(+), 9 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/ContentTranslation refs/changes/90/118690/1

diff --git a/server/segmentation/SentenceSegmenter.js b/server/segmentation/SentenceSegmenter.js
index 1362e02..7b91764 100644
--- a/server/segmentation/SentenceSegmenter.js
+++ b/server/segmentation/SentenceSegmenter.js
@@ -13,6 +13,7 @@
        crypto = require( 'crypto' ),
        Segmenter = require( __dirname + '/Segmenter.js' ).Segmenter,
        LinkSegmenter = require( __dirname + '/LinkSegmenter.js' 
).LinkSegmenter,
+       ve = require( 've' ).ve, // TODO: do this properly
        $ = require( 'jquery' );
 
 function SentenceSegmenter( content ) {
@@ -25,20 +26,42 @@
 util.inherits( SentenceSegmenter, Segmenter );
 
 SentenceSegmenter.prototype.segment = function () {
-       var i, segmentId, linkSegmenter,
-               sentences = this.content.split( '. ' );
+       var i, len, segmentId, linkSegmenter, linearModel, data, chars, 
sentences, newSentences;
+
+       linearModel = ve.dm.converter.getModelFromDom( 
ve.createDocumentFromHtml( this.content ) );
+       data = linearModel.data.data;
+       chars = [];
+       sentences = [];
+       for ( i = 0, len = data.length; i < len; i++ ) {
+               if ( typeof data[i] === 'string' ) {
+                       // like 'x'
+                       chars.append( data[i] );
+               } else if ( typeof data[i].length === 'number' ) {
+                       // like ['x', [0]]
+                       chars.append( data[i][0] );
+               } else {
+                       // A branch start/end
+                       if ( chars.length === 0 ) {
+                               continue;
+                       }
+                       newSentences = this.getBalancedHtmlSentences(
+                               i - chars.length,
+                               chars,
+                               linearModel
+                       );
+                       sentences.concat( newSentences );
+                       chars.length = 0;
+               }
+       }
 
        this.segmentCount += sentences.length;
-       for ( i = 0; i< this.segmentCount; i++ ) {
-               segmentId = crypto.createHash( 'md5' ).update( sentences[i] )
+       for ( i = 0; i < this.segmentCount; i++ ) {
+               segmentId = crypto.createHash( 'md5' ).update( 
sentences[i].text )
                        .digest( 'hex' ).substr( 0, 5 );
-               linkSegmenter = new LinkSegmenter( sentences[i] );
+               linkSegmenter = new LinkSegmenter( sentences[i].text );
                linkSegmenter.segment();
                this.links = $.extend( this.links, linkSegmenter.getSegments() 
);
-               this.segments[segmentId] = {
-                       source: linkSegmenter.toHTML() +
-                               ( ( i+1 !== this.segmentCount )? '. ': '.' ) // 
XXX
-               };
+               this.segments[segmentId] = { source: linkSegmenter.toHTML() };
                // We need this lookup to keep the order of segments
                // while constructing the segmented content using
                // toHTML method.
@@ -46,6 +69,24 @@
        }
 };
 
+SentenceSegmenter.getSentenceHtmlFragments = function ( startOffset, chars, 
linearModel ) {
+       var i, len, range, sliceModel,
+               htmlFragments = [],
+               chunks = this.splitEn( chars.join( '' ) ),
+               offset = startOffset;
+
+       for ( i = 0, len = chunks.length; i < len; i++ ) {
+               if ( chunks[i].length === 0 ) {
+                       continue;
+               }
+               range = new ve.Range( offset, offset + chunks[i].length );
+               sliceModel = linearModel.cloneSliceFromRange( range );
+               htmlFragments.push( ve.dm.converter.getModelFromDom( sliceModel 
).innerHTML );
+               offset += chunks[i].length;
+       }
+       return htmlFragments;
+};
+
 SentenceSegmenter.prototype.getLinks = function () {
        return this.links;
 };

-- 
To view, visit https://gerrit.wikimedia.org/r/118690
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I17f33d8db5a889d076619267316f2676e3255cb8
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/ContentTranslation
Gerrit-Branch: master
Gerrit-Owner: Divec <da...@sheetmusic.org.uk>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to