jenkins-bot has submitted this change and it was merged. Change subject: Segmentation: Handle all section types ......................................................................
Segmentation: Handle all section types Borrowed the definition of section types from VE. Made all tests pass. Change-Id: If1124892ae8ac3a5477e228c061fb51ddf0b442f --- M segmentation/languages/CXParser.js M tests/segmentation/SegmentationTests.js M tests/segmentation/SegmentationTests.json 3 files changed, 33 insertions(+), 11 deletions(-) Approvals: KartikMistry: Looks good to me, approved jenkins-bot: Verified diff --git a/segmentation/languages/CXParser.js b/segmentation/languages/CXParser.js index 60ca0a3..a4b445a 100644 --- a/segmentation/languages/CXParser.js +++ b/segmentation/languages/CXParser.js @@ -24,6 +24,21 @@ this.links = {}; }; +CXParser.prototype.sectionTypes = [ + 'div', 'p', + // tables + 'table', 'tbody', 'thead', 'tfoot', 'caption', 'th', 'tr', 'td', + // lists + 'ul', 'ol', 'li', 'dl', 'dt', 'dd', + // HTML5 heading content + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hgroup', + // HTML5 sectioning content + 'article', 'aside', 'body', 'nav', 'section', 'footer', 'header', 'figure', + 'figcaption', 'fieldset', 'details', 'blockquote', + // other + 'hr', 'button', 'canvas', 'center', 'col', 'colgroup', 'embed', + 'map', 'object', 'pre', 'progress', 'video' + ]; /** * Error handler */ @@ -91,8 +106,8 @@ replacement = prevWord + sentenceSeperator; //console.log([match, prevWord, sentenceSeperator, offset]); - nextLetter = sentence[offset + match.length]; - if ( prevWord && prevWord.length < 3 && prevWord[0].toUpperCase() === prevWord[0] || + nextLetter = sentence[ offset + match.length ]; + if ( prevWord && prevWord.length < 3 && prevWord[ 0 ].toUpperCase() === prevWord[ 0 ] || nextLetter && nextLetter.toLowerCase() === nextLetter ) { // abbreviation? 
return replacement; @@ -118,7 +133,7 @@ if ( !this.inSentence ) { this.print( this.startSentence() ); } - this.links[this.segmentCount] = { + this.links[ this.segmentCount ] = { href: href }; this.print( ' class="cx-link" data-linkid="' + ( this.segmentCount++ ) + '"' ); @@ -130,8 +145,14 @@ */ CXParser.prototype.onopentag = function ( tag ) { var attrName, - section = /[ph1-6]|figure|ul|div/; + section = /[ph1-6]|figure|figcaption|ul|div/; + if ( this.sectionTypes.indexOf( tag.name ) >= 0 ) { + if ( this.inSentence ) { + // Avoid dangling sentence. + this.print( this.endSentence() ); + } + } if ( tag.name === 'a' && !this.inSentence ) { // sentences starting with a link this.print( this.startSentence() ); @@ -151,7 +172,7 @@ // not leaking it to the text. So ignore these attributes. continue; } - this.print( ' ' + attrName + '="' + entity( tag.attributes[attrName] ) + '"' ); + this.print( ' ' + attrName + '="' + entity( tag.attributes[ attrName ] ) + '"' ); } // Sections @@ -175,8 +196,7 @@ * @param {string} tag */ CXParser.prototype.onclosetag = function ( tag ) { - var section = /[ph1-6]|figure|ul|div/; - if ( tag.match( section ) ) { + if ( this.sectionTypes.indexOf( tag ) >= 0 ) { if ( this.inSentence ) { // Avoid dangling sentence. 
this.print( this.endSentence() ); diff --git a/tests/segmentation/SegmentationTests.js b/tests/segmentation/SegmentationTests.js index f695bc9..51fba68 100644 --- a/tests/segmentation/SegmentationTests.js +++ b/tests/segmentation/SegmentationTests.js @@ -5,15 +5,17 @@ tests = require( './SegmentationTests.json' ); for ( var lang in tests ) { - var languageTests = tests[lang]; + var languageTests = tests[ lang ]; for ( var i in languageTests ) { - var test = languageTests[i], + var test = languageTests[ i ], segmenter; segmenter = new CXSegmenter( test.source, lang ); segmenter.segment(); var result = segmenter.getSegmentedContent(); result = result.replace( /(\r\n|\n|\t|\r)/gm, '' ); - console.log( test.result + '\n' + result ); + console.log( 'Test' + ': ' + test.source ); + console.log( 'Expected' + ': ' + test.result ); + console.log( 'Actual' + ': ' + result ); assert.equal( test.result, result ); } } diff --git a/tests/segmentation/SegmentationTests.json b/tests/segmentation/SegmentationTests.json index a65b30a..1b4c269 100644 --- a/tests/segmentation/SegmentationTests.json +++ b/tests/segmentation/SegmentationTests.json @@ -30,7 +30,7 @@ }, { "source": "<figure><a href=\"#\"><img src=\"img.png\"></a><figcaption>Figure caption</figcaption></figure>", - "result": "<figure id=\"0\"><a class=\"cx-link\" data-linkid=\"1\" href=\"#\"><img src=\"img.png\"></img></a><figcaption id=\"2\"><span class=\"cx-segment\" data-segmentid=\"3\">Figure caption</span></figcaption></figure>" + "result": "<figure id=\"0\"><span class=\"cx-segment\" data-segmentid=\"1\"><a class=\"cx-link\" data-linkid=\"2\" href=\"#\"><img src=\"img.png\"></img></a></span><figcaption id=\"3\"><span class=\"cx-segment\" data-segmentid=\"4\">Figure caption</span></figcaption></figure>" } ], "hi": [ -- To view, visit https://gerrit.wikimedia.org/r/119969 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: If1124892ae8ac3a5477e228c061fb51ddf0b442f 
Gerrit-PatchSet: 3 Gerrit-Project: mediawiki/services/cxserver Gerrit-Branch: master Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com> Gerrit-Reviewer: KartikMistry <kartik.mis...@gmail.com> Gerrit-Reviewer: Nikerabbit <niklas.laxst...@gmail.com> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits