Santhosh has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/392790 )

Change subject: Section wrapping at cxserver for v2 page fetch api
......................................................................

Section wrapping at cxserver for v2 page fetch api

The v2 page fetch API will wrap the translatable sections
in <section> tags. Immediate children of <body>
will be considered translatable sections.

Bug: T177752

Change-Id: Ibb5937061c6980579d35cd24a0ba8205b109f8c7
---
M bin/segment
M lib/lineardoc/Doc.js
M lib/lineardoc/MwContextualizer.js
M lib/lineardoc/Parser.js
M lib/mw/MWPageLoader.js
M lib/routes/v2.js
A test/mw/SectionWrap.test.js
M test/segmentation/data/result-18.html
M test/segmentation/data/result-4.html
M test/segmentation/data/test-18.html
10 files changed, 108 insertions(+), 24 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver 
refs/changes/90/392790/1

diff --git a/bin/segment b/bin/segment
index 2a18675..c23f7b0 100755
--- a/bin/segment
+++ b/bin/segment
@@ -11,7 +11,9 @@
 }
 
 function getParsedDoc( content ) {
-       const parser = new LinearDoc.Parser( new LinearDoc.MwContextualizer() );
+       const parser = new LinearDoc.Parser( new LinearDoc.MwContextualizer(), {
+               wrapSections: true
+       } );
        parser.init();
        parser.write( content );
        return parser.builder.doc;
diff --git a/lib/lineardoc/Doc.js b/lib/lineardoc/Doc.js
index 6c4047d..30d2909 100644
--- a/lib/lineardoc/Doc.js
+++ b/lib/lineardoc/Doc.js
@@ -77,10 +77,14 @@
 Doc.prototype.segment = function ( getBoundaries ) {
        var i, len, item, tag, textBlock, hash,
                newDoc = new Doc(),
+               nextSectionId = 0,
                nextId = 0;
 
        // TODO: return different counters depending on type
-       function getNextId( type ) {
+       function getNextId( type, tagName ) {
+               if ( tagName === 'section' ) {
+                       return String( 'cxSourceSection' + nextSectionId++ );
+               }
                if ( type === 'segment' || type === 'link' || type === 'block' 
) {
                        return String( nextId++ );
                } else {
@@ -115,7 +119,7 @@
                                        ).substr( 0, 30 );
                                }
                        } else {
-                               tag.attributes.id = getNextId( 'block' );
+                               tag.attributes.id = getNextId( 'block', 
tag.name );
                        }
                        newDoc.addItem( item.type, tag );
                } else if ( this.items[ i ].type !== 'textblock' ) {
diff --git a/lib/lineardoc/MwContextualizer.js 
b/lib/lineardoc/MwContextualizer.js
index c5d3742..d964443 100644
--- a/lib/lineardoc/MwContextualizer.js
+++ b/lib/lineardoc/MwContextualizer.js
@@ -1,7 +1,7 @@
 'use strict';
 
 const Contextualizer = require( './Contextualizer' );
-const contentBranchNodeNames = [ 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 
'h6', 'p', 'pre' ];
+const contentBranchNodeNames = [ 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 
'h6', 'p', 'pre', 'div', 'table', 'ol', 'li' ];
 
 /**
  * Contextualizer for MediaWiki DOM HTML
@@ -31,13 +31,18 @@
                        return 'media';
                }
 
+               // The top-level body tag opens the section context
+               if ( context === undefined && tag.name === 'body' ) {
+                       return 'section';
+               }
+
                // And figure//figcaption is contentBranch
                if ( context === 'media' && tag.name === 'figcaption' ) {
                        return 'contentBranch';
                }
 
                // And ContentBranchNodes are contentBranch
-               if ( context === undefined && contentBranchNodeNames.indexOf( 
tag.name ) > -1 ) {
+               if ( ( context === 'section' || context === undefined ) && 
contentBranchNodeNames.indexOf( tag.name ) > -1 ) {
                        return 'contentBranch';
                }
 
diff --git a/lib/lineardoc/Parser.js b/lib/lineardoc/Parser.js
index d2410da..5291228 100644
--- a/lib/lineardoc/Parser.js
+++ b/lib/lineardoc/Parser.js
@@ -57,7 +57,7 @@
        this.builder = this.rootBuilder;
        // Stack of tags currently open
        this.allTags = [];
-       // context for each tag currently open; 
undefined|'verbatim'|'media'|'contentBranch'
+       // context for each tag currently open; 
undefined|'verbatim'|'media'|'contentBranch'|'section'
        this.contexts = [];
 };
 
@@ -78,6 +78,14 @@
        } else if ( this.isInlineAnnotationTag( tag.name ) ) {
                this.builder.pushInlineAnnotationTag( tag );
        } else {
+               if ( this.options.wrapSections && 
this.contextualizer.getContext() === 'section' ) {
+                       this.builder.pushBlockTag( {
+                               name: 'section',
+                               attributes: {
+                                       rel: 'cx:Section'
+                               }
+                       } );
+               }
                this.builder.pushBlockTag( tag );
        }
        this.allTags.push( tag );
@@ -106,6 +114,9 @@
                this.builder = this.builder.parent;
        } else if ( !isAnn ) {
                this.builder.popBlockTag( tagName );
+               if ( this.options.wrapSections && 
this.contextualizer.getContext() === 'section' ) {
+                       this.builder.popBlockTag( 'section' );
+               }
        } else {
                throw new Error( 'Unexpected close tag: ' + tagName );
        }
diff --git a/lib/mw/MWPageLoader.js b/lib/mw/MWPageLoader.js
index e9c5941..14597bb 100644
--- a/lib/mw/MWPageLoader.js
+++ b/lib/mw/MWPageLoader.js
@@ -5,11 +5,17 @@
        CXSegmenter = require( '../segmentation/CXSegmenter' );
 
 class MWPageLoader extends ApiRequest {
-       getPage( page, revision ) {
+       /**
+        *
+        * @param {string} page The page title
+        * @param {string} revision The revision id
+        * @param {boolean} wrapSections Whether translatable sections should 
be wrapped in <section> tag
+        * @return {Promise}
+        */
+       getPage( page, revision, wrapSections ) {
                return this.fetch( page, revision ).then( ( response ) => {
-                       const parsedDoc = this.getParsedDoc( response.body );
-                       const segmentedDoc = new CXSegmenter().segment( 
parsedDoc, this.sourceLanguage );
-                       // TODO: segmentedDoc.wrapSections();
+                       const parsedDoc = this.getParsedDoc( response.body, 
wrapSections );
+                       let segmentedDoc = new CXSegmenter().segment( 
parsedDoc, this.sourceLanguage );
                        return {
                                content: segmentedDoc.getHtml(),
                                revision: response.revision
@@ -17,8 +23,16 @@
                } );
        }
 
-       getParsedDoc( content ) {
-               const parser = new LinearDoc.Parser( new 
LinearDoc.MwContextualizer() );
+       /**
+        *
+        * @param {string} content
+        * @param {boolean} wrapSections Whether translatable sections should 
be wrapped in <section> tag
+        * @return {Object}
+        */
+       getParsedDoc( content, wrapSections ) {
+               const parser = new LinearDoc.Parser( new 
LinearDoc.MwContextualizer(), {
+                       wrapSections
+               } );
                parser.init();
                parser.write( content );
                return parser.builder.doc;
diff --git a/lib/routes/v2.js b/lib/routes/v2.js
index 188d4ec..00aa567 100644
--- a/lib/routes/v2.js
+++ b/lib/routes/v2.js
@@ -45,7 +45,7 @@
                } );
 
                this.app.logger.log( 'debug', `Getting page 
${sourceLanguage}:${title}` );
-               return pageLoader.getPage( title, revision ).then(
+               return pageLoader.getPage( title, revision, true /* 
wrapSections */ ).then(
                        ( response ) => {
                                res.send( {
                                        sourceLanguage,
diff --git a/test/mw/SectionWrap.test.js b/test/mw/SectionWrap.test.js
new file mode 100644
index 0000000..ac0b2a2
--- /dev/null
+++ b/test/mw/SectionWrap.test.js
@@ -0,0 +1,50 @@
+'use strict';
+
+const assert = require( '../utils/assert.js' ),
+       LinearDoc = require( '../../lib/lineardoc' );
+
+function normalize( html ) {
+       var normalizer = new LinearDoc.Normalizer();
+       normalizer.init();
+       normalizer.write( html.replace( /(\r\n|\n|\t|\r)/gm, '' ) );
+       return normalizer.getHtml();
+}
+
+function getParsedDoc( content ) {
+       const parser = new LinearDoc.Parser( new LinearDoc.MwContextualizer(), {
+               wrapSections: true
+       } );
+       parser.init();
+       parser.write( content );
+       return parser.builder.doc;
+}
+
+/* eslint-disable no-multi-str */
+const sourceHTML = `<body>
+    <p>Paragraph <b>bold</b> <a href="/wiki/Title">Title</a>.</p>
+    <h3>Heading</h3>
+    <table><tr><td>data</td></tr></table>
+    <div>Content<div>innerdiv</div></div>
+    <p>Content<div>Div in paragraph</div></p>
+    <ol><li>Item</li><li>Item</li></ol>
+    <link href="./Category:Oxygen#%20" id="mwCKQ" rel="mw:PageProp/Category" />
+    </body>`;
+
+const expectedSectionWrappedHTML = `<body>
+    <section rel="cx:Section"><p>Paragraph <b>bold</b> <a 
href="/wiki/Title">Title</a>.</p></section>
+    <section rel="cx:Section"><h3>Heading</h3></section>
+    <section rel="cx:Section"><table><tr><td>data</td></tr></table></section>
+    <section rel="cx:Section"><div>Content<div>innerdiv</div></div></section>
+    <section rel="cx:Section"><p>Content<div>Div in 
paragraph</div></p></section>
+    <section rel="cx:Section"><ol><li>Item</li><li>Item</li></ol></section>
+    <link href="./Category:Oxygen#%20" id="mwCKQ" rel="mw:PageProp/Category" />
+    </body>`;
+
+describe( 'Section wrapping test', () => {
+       const parsedDoc = getParsedDoc( sourceHTML );
+       const result = normalize( parsedDoc.getHtml() );
+       const expectedResultData = normalize( expectedSectionWrappedHTML );
+       it( 'should not have any errors when section wrapping', () => {
+               assert.deepEqual( result, expectedResultData );
+       } );
+} );
diff --git a/test/segmentation/data/result-18.html 
b/test/segmentation/data/result-18.html
index de21b5d..ebdf783 100644
--- a/test/segmentation/data/result-18.html
+++ b/test/segmentation/data/result-18.html
@@ -1,3 +1 @@
-<div id="0"><p id="1"><span><span class="cx-segment" 
data-segmentid="2">Foo.<link/> </span><span class="cx-segment" 
data-segmentid="3">Bar.</span></span></p>
-<meta />
-<p id="4"><span class="cx-segment" data-segmentid="5">Baz.</span></p></div>
+<body id="0"><p id="1"><span><span class="cx-segment" 
data-segmentid="2">Foo.<link /> </span><span class="cx-segment" 
data-segmentid="3">Bar.</span></span></p><meta /><p id="4"><span 
class="cx-segment" data-segmentid="5">Baz.</span></p></body>
\ No newline at end of file
diff --git a/test/segmentation/data/result-4.html 
b/test/segmentation/data/result-4.html
index 494f294..0de936d 100644
--- a/test/segmentation/data/result-4.html
+++ b/test/segmentation/data/result-4.html
@@ -1,10 +1,10 @@
 <div id="0">
-       <div id="1">Some div</div>
-       <h1 id="2">
-               <span class="cx-segment" data-segmentid="3">Some heading</span>
+       <div id="1"><span class="cx-segment" data-segmentid="2">Some 
div</span></div>
+       <h1 id="3">
+               <span class="cx-segment" data-segmentid="4">Some heading</span>
        </h1>
-       <p id="4">
-               <span class="cx-segment" data-segmentid="5">This is first 
sentence. </span>
-               <span class="cx-segment" data-segmentid="6">This is second 
sentence</span>
+       <p id="5">
+               <span class="cx-segment" data-segmentid="6">This is first 
sentence. </span>
+               <span class="cx-segment" data-segmentid="7">This is second 
sentence</span>
        </p>
 </div>
diff --git a/test/segmentation/data/test-18.html 
b/test/segmentation/data/test-18.html
index a8f5591..7657f1d 100644
--- a/test/segmentation/data/test-18.html
+++ b/test/segmentation/data/test-18.html
@@ -1,5 +1,5 @@
-<div>
+<body>
 <p><span>Foo.<link/> Bar.</span></p>
 <meta/>
 <p>Baz.</p>
-</div>
+</body>

-- 
To view, visit https://gerrit.wikimedia.org/r/392790
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ibb5937061c6980579d35cd24a0ba8205b109f8c7
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to