Santhosh has uploaded a new change for review. https://gerrit.wikimedia.org/r/119969
Change subject: Segmentation: Handle all section types ...................................................................... Segmentation: Handle all section types Borrowed the definition of section types from VE. Made all tests passing Change-Id: If1124892ae8ac3a5477e228c061fb51ddf0b442f --- M segmentation/languages/CXParser.js M tests/segmentation/SegmentationTests.js M tests/segmentation/SegmentationTests.json 3 files changed, 43 insertions(+), 21 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/69/119969/1 diff --git a/segmentation/languages/CXParser.js b/segmentation/languages/CXParser.js index 60ca0a3..a4b445a 100644 --- a/segmentation/languages/CXParser.js +++ b/segmentation/languages/CXParser.js @@ -24,6 +24,21 @@ this.links = {}; }; +CXParser.prototype.sectionTypes = [ + 'div', 'p', + // tables + 'table', 'tbody', 'thead', 'tfoot', 'caption', 'th', 'tr', 'td', + // lists + 'ul', 'ol', 'li', 'dl', 'dt', 'dd', + // HTML5 heading content + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hgroup', + // HTML5 sectioning content + 'article', 'aside', 'body', 'nav', 'section', 'footer', 'header', 'figure', + 'figcaption', 'fieldset', 'details', 'blockquote', + // other + 'hr', 'button', 'canvas', 'center', 'col', 'colgroup', 'embed', + 'map', 'object', 'pre', 'progress', 'video' + ]; /** * Error handler */ @@ -91,8 +106,8 @@ replacement = prevWord + sentenceSeperator; //console.log([match, prevWord, sentenceSeperator, offset]); - nextLetter = sentence[offset + match.length]; - if ( prevWord && prevWord.length < 3 && prevWord[0].toUpperCase() === prevWord[0] || + nextLetter = sentence[ offset + match.length ]; + if ( prevWord && prevWord.length < 3 && prevWord[ 0 ].toUpperCase() === prevWord[ 0 ] || nextLetter && nextLetter.toLowerCase() === nextLetter ) { // abbreviation? return replacement; @@ -118,7 +133,7 @@ if ( !this.inSentence ) { this.print( this.startSentence() ); } - this.links[this.segmentCount] = { + this.links[ this.segmentCount ] = { href: href }; this.print( ' class="cx-link" data-linkid="' + ( this.segmentCount++ ) + '"' ); @@ -130,8 +145,14 @@ */ CXParser.prototype.onopentag = function ( tag ) { var attrName, - section = /[ph1-6]|figure|ul|div/; + section = /[ph1-6]|figure|figcaption|ul|div/; + if ( this.sectionTypes.indexOf( tag.name ) >= 0 ) { + if ( this.inSentence ) { + // Avoid dangling sentence. + this.print( this.endSentence() ); + } + } if ( tag.name === 'a' && !this.inSentence ) { // sentences starting with a link this.print( this.startSentence() ); @@ -151,7 +172,7 @@ // not leaking it to the text. So ignore these attributes. continue; } - this.print( ' ' + attrName + '="' + entity( tag.attributes[attrName] ) + '"' ); + this.print( ' ' + attrName + '="' + entity( tag.attributes[ attrName ] ) + '"' ); } // Sections @@ -175,8 +196,7 @@ * @param {string} tag */ CXParser.prototype.onclosetag = function ( tag ) { - var section = /[ph1-6]|figure|ul|div/; - if ( tag.match( section ) ) { + if ( this.sectionTypes.indexOf( tag ) >= 0 ) { if ( this.inSentence ) { // Avoid dangling sentence. this.print( this.endSentence() ); diff --git a/tests/segmentation/SegmentationTests.js b/tests/segmentation/SegmentationTests.js index f695bc9..b20cede 100644 --- a/tests/segmentation/SegmentationTests.js +++ b/tests/segmentation/SegmentationTests.js @@ -1,19 +1,21 @@ 'use strict'; var assert = require( 'assert' ), - CXSegmenter = require( __dirname + '/../../segmentation/CXSegmenter.js' ).CXSegmenter, - tests = require( './SegmentationTests.json' ); + CXSegmenter = require( __dirname + '/../../segmentation/CXSegmenter.js' ).CXSegmenter, + tests = require( './SegmentationTests.json' ); for ( var lang in tests ) { - var languageTests = tests[lang]; - for ( var i in languageTests ) { - var test = languageTests[i], - segmenter; - segmenter = new CXSegmenter( test.source, lang ); - segmenter.segment(); - var result = segmenter.getSegmentedContent(); - result = result.replace( /(\r\n|\n|\t|\r)/gm, '' ); - console.log( test.result + '\n' + result ); - assert.equal( test.result, result ); - } + var languageTests = tests[ lang ]; + for ( var i in languageTests ) { + var test = languageTests[ i ], + segmenter; + segmenter = new CXSegmenter( test.source, lang ); + segmenter.segment(); + var result = segmenter.getSegmentedContent(); + result = result.replace( /(\r\n|\n|\t|\r)/gm, '' ); + console.log( "Test" + ':' + test.source ); + console.log( "Expected" + ':' + test.result ); + console.log( "Actual"+':' + result ); + assert.equal( test.result, result ); + } } diff --git a/tests/segmentation/SegmentationTests.json b/tests/segmentation/SegmentationTests.json index a65b30a..1b4c269 100644 --- a/tests/segmentation/SegmentationTests.json +++ b/tests/segmentation/SegmentationTests.json @@ -30,7 +30,7 @@ }, { "source": "<figure><a href=\"#\"><img src=\"img.png\"></a><figcaption>Figure caption</figcaption></figure>", - "result": "<figure id=\"0\"><a class=\"cx-link\" data-linkid=\"1\" href=\"#\"><img src=\"img.png\"></img></a><figcaption id=\"2\"><span class=\"cx-segment\" data-segmentid=\"3\">Figure caption</span></figcaption></figure>" + "result": "<figure id=\"0\"><span class=\"cx-segment\" data-segmentid=\"1\"><a class=\"cx-link\" data-linkid=\"2\" href=\"#\"><img src=\"img.png\"></img></a></span><figcaption id=\"3\"><span class=\"cx-segment\" data-segmentid=\"4\">Figure caption</span></figcaption></figure>" } ], "hi": [ -- To view, visit https://gerrit.wikimedia.org/r/119969 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: If1124892ae8ac3a5477e228c061fb51ddf0b442f Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/cxserver Gerrit-Branch: master Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits