Santhosh has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/119969

Change subject: Segmentation: Handle all section types
......................................................................

Segmentation: Handle all section types

Borrowed the definition of section types from VisualEditor (VE).
Made all tests pass.

Change-Id: If1124892ae8ac3a5477e228c061fb51ddf0b442f
---
M segmentation/languages/CXParser.js
M tests/segmentation/SegmentationTests.js
M tests/segmentation/SegmentationTests.json
3 files changed, 43 insertions(+), 21 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver 
refs/changes/69/119969/1

diff --git a/segmentation/languages/CXParser.js 
b/segmentation/languages/CXParser.js
index 60ca0a3..a4b445a 100644
--- a/segmentation/languages/CXParser.js
+++ b/segmentation/languages/CXParser.js
@@ -24,6 +24,21 @@
        this.links = {};
 };
 
+CXParser.prototype.sectionTypes = [
+       'div', 'p',
+       // tables
+       'table', 'tbody', 'thead', 'tfoot', 'caption', 'th', 'tr', 'td',
+       // lists
+       'ul', 'ol', 'li', 'dl', 'dt', 'dd',
+       // HTML5 heading content
+       'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hgroup',
+       // HTML5 sectioning content
+       'article', 'aside', 'body', 'nav', 'section', 'footer', 'header', 
'figure',
+       'figcaption', 'fieldset', 'details', 'blockquote',
+       // other
+       'hr', 'button', 'canvas', 'center', 'col', 'colgroup', 'embed',
+       'map', 'object', 'pre', 'progress', 'video'
+ ];
 /**
  * Error handler
  */
@@ -91,8 +106,8 @@
 
                replacement = prevWord + sentenceSeperator;
                //console.log([match, prevWord, sentenceSeperator, offset]);
-               nextLetter = sentence[offset + match.length];
-               if ( prevWord && prevWord.length < 3 && 
prevWord[0].toUpperCase() === prevWord[0] ||
+               nextLetter = sentence[ offset + match.length ];
+               if ( prevWord && prevWord.length < 3 && prevWord[ 0 
].toUpperCase() === prevWord[ 0 ] ||
                        nextLetter && nextLetter.toLowerCase() === nextLetter ) 
{
                        // abbreviation?
                        return replacement;
@@ -118,7 +133,7 @@
        if ( !this.inSentence ) {
                this.print( this.startSentence() );
        }
-       this.links[this.segmentCount] = {
+       this.links[ this.segmentCount ] = {
                href: href
        };
        this.print( ' class="cx-link" data-linkid="' + ( this.segmentCount++ ) 
+ '"' );
@@ -130,8 +145,14 @@
  */
 CXParser.prototype.onopentag = function ( tag ) {
        var attrName,
-               section = /[ph1-6]|figure|ul|div/;
+               section = /[ph1-6]|figure|figcaption|ul|div/;
 
+       if ( this.sectionTypes.indexOf( tag.name ) >= 0 ) {
+               if ( this.inSentence ) {
+                       // Avoid dangling sentence.
+                       this.print( this.endSentence() );
+               }
+       }
        if ( tag.name === 'a' && !this.inSentence ) {
                // sentences starting with a link
                this.print( this.startSentence() );
@@ -151,7 +172,7 @@
                        // not leaking it to the text. So ignore these 
attributes.
                        continue;
                }
-               this.print( ' ' + attrName + '="' + entity( 
tag.attributes[attrName] ) + '"' );
+               this.print( ' ' + attrName + '="' + entity( tag.attributes[ 
attrName ] ) + '"' );
        }
 
        // Sections
@@ -175,8 +196,7 @@
  * @param {string} tag
  */
 CXParser.prototype.onclosetag = function ( tag ) {
-       var section = /[ph1-6]|figure|ul|div/;
-       if ( tag.match( section ) ) {
+       if ( this.sectionTypes.indexOf( tag ) >= 0 ) {
                if ( this.inSentence ) {
                        // Avoid dangling sentence.
                        this.print( this.endSentence() );
diff --git a/tests/segmentation/SegmentationTests.js 
b/tests/segmentation/SegmentationTests.js
index f695bc9..b20cede 100644
--- a/tests/segmentation/SegmentationTests.js
+++ b/tests/segmentation/SegmentationTests.js
@@ -1,19 +1,21 @@
 'use strict';
 
 var assert = require( 'assert' ),
-       CXSegmenter = require( __dirname + '/../../segmentation/CXSegmenter.js' 
).CXSegmenter,
-       tests = require( './SegmentationTests.json' );
+        CXSegmenter = require( __dirname + 
'/../../segmentation/CXSegmenter.js' ).CXSegmenter,
+        tests = require( './SegmentationTests.json' );
 
 for ( var lang in tests ) {
-       var languageTests = tests[lang];
-       for ( var i in languageTests ) {
-               var test = languageTests[i],
-                       segmenter;
-               segmenter = new CXSegmenter( test.source, lang );
-               segmenter.segment();
-               var result = segmenter.getSegmentedContent();
-               result = result.replace( /(\r\n|\n|\t|\r)/gm, '' );
-               console.log( test.result + '\n' + result );
-               assert.equal( test.result, result );
-       }
+        var languageTests = tests[ lang ];
+        for ( var i in languageTests ) {
+                var test = languageTests[ i ],
+                        segmenter;
+                segmenter = new CXSegmenter( test.source, lang );
+                segmenter.segment();
+                var result = segmenter.getSegmentedContent();
+                result = result.replace( /(\r\n|\n|\t|\r)/gm, '' );
+                console.log( "Test" + ':' + test.source );
+                console.log( "Expected" + ':' + test.result );
+                console.log( "Actual"+':' + result );
+                assert.equal( test.result, result );
+        }
 }
diff --git a/tests/segmentation/SegmentationTests.json 
b/tests/segmentation/SegmentationTests.json
index a65b30a..1b4c269 100644
--- a/tests/segmentation/SegmentationTests.json
+++ b/tests/segmentation/SegmentationTests.json
@@ -30,7 +30,7 @@
                },
                {
                        "source": "<figure><a href=\"#\"><img 
src=\"img.png\"></a><figcaption>Figure caption</figcaption></figure>",
-                       "result": "<figure id=\"0\"><a class=\"cx-link\" 
data-linkid=\"1\" href=\"#\"><img src=\"img.png\"></img></a><figcaption 
id=\"2\"><span class=\"cx-segment\" data-segmentid=\"3\">Figure 
caption</span></figcaption></figure>"
+                       "result": "<figure id=\"0\"><span class=\"cx-segment\" 
data-segmentid=\"1\"><a class=\"cx-link\" data-linkid=\"2\" href=\"#\"><img 
src=\"img.png\"></img></a></span><figcaption id=\"3\"><span 
class=\"cx-segment\" data-segmentid=\"4\">Figure 
caption</span></figcaption></figure>"
                }
        ],
        "hi": [

-- 
To view, visit https://gerrit.wikimedia.org/r/119969
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: If1124892ae8ac3a5477e228c061fb51ddf0b442f
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to