Divec has uploaded a new change for review. https://gerrit.wikimedia.org/r/118690
Change subject: WIP: Use ve.dm linear model for segmentation ...................................................................... WIP: Use ve.dm linear model for segmentation Use VisualEditor linear model to ensure balanced HTML when performing sentence segmentation. This allows sentences to be manipulated more easily in an HTML DOM (wrapped in HTML span tags etc). Change-Id: I17f33d8db5a889d076619267316f2676e3255cb8 TODO: Include ve properly TODO: Language-specific subclasses --- M server/segmentation/SentenceSegmenter.js 1 file changed, 50 insertions(+), 9 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/ContentTranslation refs/changes/90/118690/1 diff --git a/server/segmentation/SentenceSegmenter.js b/server/segmentation/SentenceSegmenter.js index 1362e02..7b91764 100644 --- a/server/segmentation/SentenceSegmenter.js +++ b/server/segmentation/SentenceSegmenter.js @@ -13,6 +13,7 @@ crypto = require( 'crypto' ), Segmenter = require( __dirname + '/Segmenter.js' ).Segmenter, LinkSegmenter = require( __dirname + '/LinkSegmenter.js' ).LinkSegmenter, + ve = require( 've' ).ve, // TODO: do this properly $ = require( 'jquery' ); function SentenceSegmenter( content ) { @@ -25,20 +26,42 @@ util.inherits( SentenceSegmenter, Segmenter ); SentenceSegmenter.prototype.segment = function () { - var i, segmentId, linkSegmenter, - sentences = this.content.split( '. ' ); + var i, len, segmentId, linkSegmenter, linearModel, data, chars, sentences, newSentences; + + linearModel = ve.dm.converter.getModelFromDom( ve.createDocumentFromHtml( this.content ) ); + data = linearModel.data.data; + chars = []; + sentences = []; + for ( i = 0, len = data.length; i < len; i++ ) { + if ( typeof data[i] === 'string' ) { + // like 'x' + chars.append( data[i] ); + } else if ( typeof data[i].length === 'number' ) { + // like ['x', [0]] + chars.append( data[i][0] ); + } else { + // A branch start/end + if ( chars.length === 0 ) { + continue; + } + newSentences = this.getBalancedHtmlSentences( + i - chars.length, + chars, + linearModel + ); + sentences.concat( newSentences ); + chars.length = 0; + } + } this.segmentCount += sentences.length; - for ( i = 0; i< this.segmentCount; i++ ) { - segmentId = crypto.createHash( 'md5' ).update( sentences[i] ) + for ( i = 0; i < this.segmentCount; i++ ) { + segmentId = crypto.createHash( 'md5' ).update( sentences[i].text ) .digest( 'hex' ).substr( 0, 5 ); - linkSegmenter = new LinkSegmenter( sentences[i] ); + linkSegmenter = new LinkSegmenter( sentences[i].text ); linkSegmenter.segment(); this.links = $.extend( this.links, linkSegmenter.getSegments() ); - this.segments[segmentId] = { - source: linkSegmenter.toHTML() + - ( ( i+1 !== this.segmentCount )? '. ': '.' ) // XXX - }; + this.segments[segmentId] = { source: linkSegmenter.toHTML() }; // We need this lookup to keep the order of segments // while constructing the segmented content using // toHTML method. @@ -46,6 +69,24 @@ } }; +SentenceSegmenter.getSentenceHtmlFragments = function ( startOffset, chars, linearModel ) { + var i, len, range, sliceModel, + htmlFragments = [], + chunks = this.splitEn( chars.join( '' ) ), + offset = startOffset; + + for ( i = 0, len = chunks.length; i < len; i++ ) { + if ( chunks[i].length === 0 ) { + continue; + } + range = new ve.Range( offset, offset + chunks[i].length ); + sliceModel = linearModel.cloneSliceFromRange( range ); + htmlFragments.push( ve.dm.converter.getModelFromDom( sliceModel ).innerHTML ); + offset += chunks[i].length; + } + return htmlFragments; +}; + SentenceSegmenter.prototype.getLinks = function () { return this.links; }; -- To view, visit https://gerrit.wikimedia.org/r/118690 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I17f33d8db5a889d076619267316f2676e3255cb8 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/ContentTranslation Gerrit-Branch: master Gerrit-Owner: Divec <da...@sheetmusic.org.uk> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits