Santhosh has uploaded a new change for review.
https://gerrit.wikimedia.org/r/119969
Change subject: Segmentation: Handle all section types
......................................................................
Segmentation: Handle all section types
Borrowed the definition of section types from VE.
Made all tests pass.
Change-Id: If1124892ae8ac3a5477e228c061fb51ddf0b442f
---
M segmentation/languages/CXParser.js
M tests/segmentation/SegmentationTests.js
M tests/segmentation/SegmentationTests.json
3 files changed, 43 insertions(+), 21 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver
refs/changes/69/119969/1
diff --git a/segmentation/languages/CXParser.js
b/segmentation/languages/CXParser.js
index 60ca0a3..a4b445a 100644
--- a/segmentation/languages/CXParser.js
+++ b/segmentation/languages/CXParser.js
@@ -24,6 +24,21 @@
this.links = {};
};
+CXParser.prototype.sectionTypes = [
+ 'div', 'p',
+ // tables
+ 'table', 'tbody', 'thead', 'tfoot', 'caption', 'th', 'tr', 'td',
+ // lists
+ 'ul', 'ol', 'li', 'dl', 'dt', 'dd',
+ // HTML5 heading content
+ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hgroup',
+ // HTML5 sectioning content
+ 'article', 'aside', 'body', 'nav', 'section', 'footer', 'header',
'figure',
+ 'figcaption', 'fieldset', 'details', 'blockquote',
+ // other
+ 'hr', 'button', 'canvas', 'center', 'col', 'colgroup', 'embed',
+ 'map', 'object', 'pre', 'progress', 'video'
+ ];
/**
* Error handler
*/
@@ -91,8 +106,8 @@
replacement = prevWord + sentenceSeperator;
//console.log([match, prevWord, sentenceSeperator, offset]);
- nextLetter = sentence[offset + match.length];
- if ( prevWord && prevWord.length < 3 &&
prevWord[0].toUpperCase() === prevWord[0] ||
+ nextLetter = sentence[ offset + match.length ];
+ if ( prevWord && prevWord.length < 3 && prevWord[ 0
].toUpperCase() === prevWord[ 0 ] ||
nextLetter && nextLetter.toLowerCase() === nextLetter )
{
// abbreviation?
return replacement;
@@ -118,7 +133,7 @@
if ( !this.inSentence ) {
this.print( this.startSentence() );
}
- this.links[this.segmentCount] = {
+ this.links[ this.segmentCount ] = {
href: href
};
this.print( ' class="cx-link" data-linkid="' + ( this.segmentCount++ )
+ '"' );
@@ -130,8 +145,14 @@
*/
CXParser.prototype.onopentag = function ( tag ) {
var attrName,
- section = /[ph1-6]|figure|ul|div/;
+ section = /[ph1-6]|figure|figcaption|ul|div/;
+ if ( this.sectionTypes.indexOf( tag.name ) >= 0 ) {
+ if ( this.inSentence ) {
+ // Avoid dangling sentence.
+ this.print( this.endSentence() );
+ }
+ }
if ( tag.name === 'a' && !this.inSentence ) {
// sentences starting with a link
this.print( this.startSentence() );
@@ -151,7 +172,7 @@
// not leaking it to the text. So ignore these
attributes.
continue;
}
- this.print( ' ' + attrName + '="' + entity(
tag.attributes[attrName] ) + '"' );
+ this.print( ' ' + attrName + '="' + entity( tag.attributes[
attrName ] ) + '"' );
}
// Sections
@@ -175,8 +196,7 @@
* @param {string} tag
*/
CXParser.prototype.onclosetag = function ( tag ) {
- var section = /[ph1-6]|figure|ul|div/;
- if ( tag.match( section ) ) {
+ if ( this.sectionTypes.indexOf( tag ) >= 0 ) {
if ( this.inSentence ) {
// Avoid dangling sentence.
this.print( this.endSentence() );
diff --git a/tests/segmentation/SegmentationTests.js
b/tests/segmentation/SegmentationTests.js
index f695bc9..b20cede 100644
--- a/tests/segmentation/SegmentationTests.js
+++ b/tests/segmentation/SegmentationTests.js
@@ -1,19 +1,21 @@
'use strict';
var assert = require( 'assert' ),
- CXSegmenter = require( __dirname + '/../../segmentation/CXSegmenter.js'
).CXSegmenter,
- tests = require( './SegmentationTests.json' );
+ CXSegmenter = require( __dirname +
'/../../segmentation/CXSegmenter.js' ).CXSegmenter,
+ tests = require( './SegmentationTests.json' );
for ( var lang in tests ) {
- var languageTests = tests[lang];
- for ( var i in languageTests ) {
- var test = languageTests[i],
- segmenter;
- segmenter = new CXSegmenter( test.source, lang );
- segmenter.segment();
- var result = segmenter.getSegmentedContent();
- result = result.replace( /(\r\n|\n|\t|\r)/gm, '' );
- console.log( test.result + '\n' + result );
- assert.equal( test.result, result );
- }
+ var languageTests = tests[ lang ];
+ for ( var i in languageTests ) {
+ var test = languageTests[ i ],
+ segmenter;
+ segmenter = new CXSegmenter( test.source, lang );
+ segmenter.segment();
+ var result = segmenter.getSegmentedContent();
+ result = result.replace( /(\r\n|\n|\t|\r)/gm, '' );
+ console.log( "Test" + ':' + test.source );
+ console.log( "Expected" + ':' + test.result );
+ console.log( "Actual"+':' + result );
+ assert.equal( test.result, result );
+ }
}
diff --git a/tests/segmentation/SegmentationTests.json
b/tests/segmentation/SegmentationTests.json
index a65b30a..1b4c269 100644
--- a/tests/segmentation/SegmentationTests.json
+++ b/tests/segmentation/SegmentationTests.json
@@ -30,7 +30,7 @@
},
{
"source": "<figure><a href=\"#\"><img
src=\"img.png\"></a><figcaption>Figure caption</figcaption></figure>",
- "result": "<figure id=\"0\"><a class=\"cx-link\"
data-linkid=\"1\" href=\"#\"><img src=\"img.png\"></img></a><figcaption
id=\"2\"><span class=\"cx-segment\" data-segmentid=\"3\">Figure
caption</span></figcaption></figure>"
+ "result": "<figure id=\"0\"><span class=\"cx-segment\"
data-segmentid=\"1\"><a class=\"cx-link\" data-linkid=\"2\" href=\"#\"><img
src=\"img.png\"></img></a></span><figcaption id=\"3\"><span
class=\"cx-segment\" data-segmentid=\"4\">Figure
caption</span></figcaption></figure>"
}
],
"hi": [
--
To view, visit https://gerrit.wikimedia.org/r/119969
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: If1124892ae8ac3a5477e228c061fb51ddf0b442f
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits